From 260269c3928297165550d102c927f42ac5f81776 Mon Sep 17 00:00:00 2001
From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com>
Date: Thu, 19 Oct 2023 13:43:40 -0500
Subject: [PATCH 1/7] rename workflow repository (#14300)

---
 .github/workflows/build.yaml | 16 ++++++++--------
 .github/workflows/pr.yaml    | 28 ++++++++++++++--------------
 .github/workflows/test.yaml  | 16 ++++++++--------
 ci/release/update-version.sh |  2 +-
 4 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 0e120d34bb1..666d8844a80 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -57,7 +57,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -69,7 +69,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-cudf:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -79,7 +79,7 @@ jobs:
   wheel-publish-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -89,7 +89,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-publish-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10
     with:
       matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1")))
       build_type: ${{ inputs.build_type || 'branch' }}
@@ -100,7 +100,7 @@ jobs:
   wheel-publish-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 054ea7968c8..cb9d67639ef 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -26,34 +26,34 @@ jobs:
       - wheel-build-dask-cudf
       - wheel-tests-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.10
   checks:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.10
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10
     with:
       build_type: pull-request
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.10
     with:
       build_type: pull-request
   conda-python-cudf-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10
     with:
       build_type: pull-request
       test_script: "ci/test_python_cudf.sh"
@@ -61,14 +61,14 @@ jobs:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10
     with:
       build_type: pull-request
       test_script: "ci/test_python_other.sh"
   conda-java-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -78,7 +78,7 @@ jobs:
   conda-notebook-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -88,7 +88,7 @@ jobs:
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -98,21 +98,21 @@ jobs:
   wheel-build-cudf:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10
     with:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
   wheel-tests-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10
     with:
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
   wheel-build-dask-cudf:
     needs: wheel-tests-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10
     with:
       matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1")))
       build_type: pull-request
@@ -120,7 +120,7 @@ jobs:
   wheel-tests-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10
     with:
       matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1")))
       build_type: pull-request
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 030f2e41db4..1ba0a7491a8 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -24,7 +24,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-cpp-memcheck-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -36,7 +36,7 @@ jobs:
       run_script: "ci/test_cpp_memcheck.sh"
   conda-python-cudf-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   conda-python-other-tests:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -55,7 +55,7 @@ jobs:
       test_script: "ci/test_python_other.sh"
   conda-java-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -67,7 +67,7 @@ jobs:
       run_script: "ci/test_java.sh"
   conda-notebook-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -79,7 +79,7 @@ jobs:
       run_script: "ci/test_notebooks.sh"
   wheel-tests-cudf:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -88,7 +88,7 @@ jobs:
       script: ci/test_wheel_cudf.sh
   wheel-tests-dask-cudf:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10
     with:
       matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1")))
       build_type: nightly
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 5e735a71994..eac64fe1a0f 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -113,7 +113,7 @@ sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_
 
 # CI files
 for FILE in .github/workflows/*.yaml; do
-  sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
+  sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
   sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE};
 done
 sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh

From f7ad66f440168fd4eceb3cc900301661023e42a1 Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Fri, 20 Oct 2023 09:31:16 -0700
Subject: [PATCH 2/7] Add DELTA_BINARY_PACKED encoder for Parquet writer
 (#14100)

Part of #13501. Adds ability to fall back on DELTA_BINARY_PACKED encoding when V2 page headers are selected and dictionary encoding is not possible.

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Yunsong Wang (https://github.com/PointKernel)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/14100
---
 cpp/src/io/parquet/delta_binary.cuh |   6 -
 cpp/src/io/parquet/delta_enc.cuh    | 290 ++++++++
 cpp/src/io/parquet/page_enc.cu      | 998 ++++++++++++++++++++--------
 cpp/src/io/parquet/parquet_gpu.hpp  |  51 +-
 cpp/tests/io/parquet_test.cpp       | 108 ++-
 5 files changed, 1156 insertions(+), 297 deletions(-)
 create mode 100644 cpp/src/io/parquet/delta_enc.cuh

diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh
index a513e6674b4..e3b23f4c0a0 100644
--- a/cpp/src/io/parquet/delta_binary.cuh
+++ b/cpp/src/io/parquet/delta_binary.cuh
@@ -46,12 +46,6 @@ namespace cudf::io::parquet::detail {
 // encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix
 // lengths, followed by the concatenated suffix data.
 
-// TODO: The delta encodings use ULEB128 integers, but for now we're only
-// using max 64 bits. Need to see what the performance impact is of using
-// __int128_t rather than int64_t.
-using uleb128_t   = uint64_t;
-using zigzag128_t = int64_t;
-
 // we decode one mini-block at a time. max mini-block size seen is 64.
 constexpr int delta_rolling_buf_size = 128;
 
diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh
new file mode 100644
index 00000000000..28f8cdfe2c1
--- /dev/null
+++ b/cpp/src/io/parquet/delta_enc.cuh
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "parquet_gpu.hpp"
+
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/integer_utils.hpp>
+
+#include <cub/cub.cuh>
+
+namespace cudf::io::parquet::detail {
+
+namespace delta {
+
+inline __device__ void put_uleb128(uint8_t*& p, uleb128_t v)
+{
+  while (v > 0x7f) {
+    *(p++) = v | 0x80;
+    v >>= 7;
+  }
+  *(p++) = v;
+}
+
+inline __device__ void put_zz128(uint8_t*& p, zigzag128_t v)
+{
+  zigzag128_t s = (v < 0);
+  put_uleb128(p, (v ^ -s) * 2 + s);
+}
+
+// A block size of 128, with 4 mini-blocks of 32 values each fits nicely without consuming
+// too much shared memory.
+// The parquet spec requires block_size to be a multiple of 128, and values_per_mini_block
+// to be a multiple of 32.
+constexpr int block_size            = 128;
+constexpr int num_mini_blocks       = 4;
+constexpr int values_per_mini_block = block_size / num_mini_blocks;
+constexpr int buffer_size           = 2 * block_size;
+
+// An extra sanity checks to enforce compliance with the parquet specification.
+static_assert(block_size % 128 == 0);
+static_assert(values_per_mini_block % 32 == 0);
+
+using block_reduce = cub::BlockReduce<zigzag128_t, block_size>;
+using warp_reduce  = cub::WarpReduce<uleb128_t>;
+using index_scan   = cub::BlockScan<size_type, block_size>;
+
+constexpr int rolling_idx(int index) { return rolling_index<buffer_size>(index); }
+
+// Version of bit packer that can handle up to 64 bits values.
+// T is the type to use for processing. if nbits <= 32 use uint32_t, otherwise unsigned long long
+// (not uint64_t because of atomicOr's typing). allowing this to be selectable since there's a
+// measurable impact to using the wider types.
+template <typename scratch_type>
+inline __device__ void bitpack_mini_block(
+  uint8_t* dst, uleb128_t val, uint32_t count, uint8_t nbits, void* temp_space)
+{
+  using wide_type =
+    std::conditional_t<std::is_same_v<scratch_type, unsigned long long>, __uint128_t, uint64_t>;
+  using cudf::detail::warp_size;
+  scratch_type constexpr mask = sizeof(scratch_type) * 8 - 1;
+  auto constexpr div          = sizeof(scratch_type) * 8;
+
+  auto const lane_id = threadIdx.x % warp_size;
+  auto const warp_id = threadIdx.x / warp_size;
+
+  auto const scratch = reinterpret_cast<scratch_type*>(temp_space) + warp_id * warp_size;
+
+  // zero out scratch
+  scratch[lane_id] = 0;
+  __syncwarp();
+
+  // TODO: see if there is any savings using special packing for easy bitwidths (1,2,4,8,16...)
+  // like what's done for the RLE encoder.
+  if (nbits == div) {
+    if (lane_id < count) {
+      for (int i = 0; i < sizeof(scratch_type); i++) {
+        dst[lane_id * sizeof(scratch_type) + i] = val & 0xff;
+        val >>= 8;
+      }
+    }
+    return;
+  }
+
+  if (lane_id <= count) {
+    // Shift symbol left by up to mask bits.
+    wide_type v2 = val;
+    v2 <<= (lane_id * nbits) & mask;
+
+    // Copy N bit word into two N/2 bit words while following C++ strict aliasing rules.
+    scratch_type v1[2];
+    memcpy(&v1, &v2, sizeof(wide_type));
+
+    // Atomically write result to scratch.
+    if (v1[0]) { atomicOr(scratch + ((lane_id * nbits) / div), v1[0]); }
+    if (v1[1]) { atomicOr(scratch + ((lane_id * nbits) / div) + 1, v1[1]); }
+  }
+  __syncwarp();
+
+  // Copy scratch data to final destination.
+  auto const available_bytes = util::div_rounding_up_safe(count * nbits, 8U);
+  auto const scratch_bytes   = reinterpret_cast<uint8_t const*>(scratch);
+
+  for (uint32_t i = lane_id; i < available_bytes; i += warp_size) {
+    dst[i] = scratch_bytes[i];
+  }
+  __syncwarp();
+}
+
+}  // namespace delta
+
+// Object used to turn a stream of integers into a DELTA_BINARY_PACKED stream. This takes as input
+// 128 values with validity at a time, saving them until there are enough values for a block
+// to be written.
+// T is the input data type (either zigzag128_t or uleb128_t).
+template <typename T>
+class delta_binary_packer {
+ private:
+  uint8_t* _dst;                             // sink to dump encoded values to
+  T* _buffer;                                // buffer to store values to be encoded
+  size_type _current_idx;                    // index of first value in buffer
+  uint32_t _num_values;                      // total number of values to encode
+  size_type _values_in_buffer;               // current number of values stored in _buffer
+  uint8_t _mb_bits[delta::num_mini_blocks];  // bitwidth for each mini-block
+
+  // pointers to shared scratch memory for the warp and block scans/reduces
+  delta::index_scan::TempStorage* _scan_tmp;
+  delta::warp_reduce::TempStorage* _warp_tmp;
+  delta::block_reduce::TempStorage* _block_tmp;
+
+  void* _bitpack_tmp;  // pointer to shared scratch memory used in bitpacking
+
+  // Write the delta binary header. Only call from thread 0.
+  inline __device__ void write_header()
+  {
+    delta::put_uleb128(_dst, delta::block_size);
+    delta::put_uleb128(_dst, delta::num_mini_blocks);
+    delta::put_uleb128(_dst, _num_values);
+    delta::put_zz128(_dst, _buffer[0]);
+  }
+
+  // Write the block header. Only call from thread 0.
+  inline __device__ void write_block_header(zigzag128_t block_min)
+  {
+    delta::put_zz128(_dst, block_min);
+    memcpy(_dst, _mb_bits, 4);
+    _dst += 4;
+  }
+
+  // Signed subtraction with defined wrapping behavior.
+  inline __device__ zigzag128_t subtract(zigzag128_t a, zigzag128_t b)
+  {
+    return static_cast<zigzag128_t>(static_cast<uleb128_t>(a) - static_cast<uleb128_t>(b));
+  }
+
+ public:
+  inline __device__ auto num_values() const { return _num_values; }
+
+  // Initialize the object. Only call from thread 0.
+  inline __device__ void init(uint8_t* dest, uint32_t num_values, T* buffer, void* temp_storage)
+  {
+    _dst              = dest;
+    _num_values       = num_values;
+    _buffer           = buffer;
+    _scan_tmp         = reinterpret_cast<delta::index_scan::TempStorage*>(temp_storage);
+    _warp_tmp         = reinterpret_cast<delta::warp_reduce::TempStorage*>(temp_storage);
+    _block_tmp        = reinterpret_cast<delta::block_reduce::TempStorage*>(temp_storage);
+    _bitpack_tmp      = _buffer + delta::buffer_size;
+    _current_idx      = 0;
+    _values_in_buffer = 0;
+  }
+
+  // Each thread calls this to add its current value.
+  inline __device__ void add_value(T value, bool is_valid)
+  {
+    // Figure out the correct position for the given value.
+    size_type const valid = is_valid;
+    size_type pos;
+    size_type num_valid;
+    delta::index_scan(*_scan_tmp).ExclusiveSum(valid, pos, num_valid);
+
+    if (is_valid) { _buffer[delta::rolling_idx(pos + _current_idx + _values_in_buffer)] = value; }
+    __syncthreads();
+
+    if (threadIdx.x == 0) {
+      _values_in_buffer += num_valid;
+      // if first pass write header
+      if (_current_idx == 0) {
+        write_header();
+        _current_idx = 1;
+        _values_in_buffer -= 1;
+      }
+    }
+    __syncthreads();
+
+    if (_values_in_buffer >= delta::block_size) { flush(); }
+  }
+
+  // Called by each thread to flush data to the sink.
+  inline __device__ uint8_t const* flush()
+  {
+    using cudf::detail::warp_size;
+    __shared__ zigzag128_t block_min;
+
+    int const t       = threadIdx.x;
+    int const warp_id = t / warp_size;
+    int const lane_id = t % warp_size;
+
+    if (_values_in_buffer <= 0) { return _dst; }
+
+    // Calculate delta for this thread.
+    size_type const idx     = _current_idx + t;
+    zigzag128_t const delta = idx < _num_values ? subtract(_buffer[delta::rolling_idx(idx)],
+                                                           _buffer[delta::rolling_idx(idx - 1)])
+                                                : std::numeric_limits<zigzag128_t>::max();
+
+    // Find min delta for the block.
+    auto const min_delta = delta::block_reduce(*_block_tmp).Reduce(delta, cub::Min());
+
+    if (t == 0) { block_min = min_delta; }
+    __syncthreads();
+
+    // Compute frame of reference for the block.
+    uleb128_t const norm_delta = idx < _num_values ? subtract(delta, block_min) : 0;
+
+    // Get max normalized delta for each warp, and use that to determine how many bits to use
+    // for the bitpacking of this warp.
+    zigzag128_t const warp_max =
+      delta::warp_reduce(_warp_tmp[warp_id]).Reduce(norm_delta, cub::Max());
+    __syncwarp();
+
+    if (lane_id == 0) { _mb_bits[warp_id] = sizeof(zigzag128_t) * 8 - __clzll(warp_max); }
+    __syncthreads();
+
+    // write block header
+    if (t == 0) { write_block_header(block_min); }
+    __syncthreads();
+
+    // Now each warp encodes its data...can calculate starting offset with _mb_bits.
+    // NOTE: using a switch here rather than a loop because the compiler produces code that
+    // uses fewer registers.
+    int cumulative_bits = 0;
+    switch (warp_id) {
+      case 3: cumulative_bits += _mb_bits[2]; [[fallthrough]];
+      case 2: cumulative_bits += _mb_bits[1]; [[fallthrough]];
+      case 1: cumulative_bits += _mb_bits[0];
+    }
+    uint8_t* const mb_ptr = _dst + cumulative_bits * delta::values_per_mini_block / 8;
+
+    // encoding happens here
+    auto const warp_idx = _current_idx + warp_id * delta::values_per_mini_block;
+    if (warp_idx < _num_values) {
+      auto const num_enc = min(delta::values_per_mini_block, _num_values - warp_idx);
+      if (_mb_bits[warp_id] > 32) {
+        delta::bitpack_mini_block<unsigned long long>(
+          mb_ptr, norm_delta, num_enc, _mb_bits[warp_id], _bitpack_tmp);
+      } else {
+        delta::bitpack_mini_block<uint32_t>(
+          mb_ptr, norm_delta, num_enc, _mb_bits[warp_id], _bitpack_tmp);
+      }
+    }
+    __syncthreads();
+
+    // Last warp updates global delta ptr.
+    if (warp_id == delta::num_mini_blocks - 1 && lane_id == 0) {
+      _dst              = mb_ptr + _mb_bits[warp_id] * delta::values_per_mini_block / 8;
+      _current_idx      = min(warp_idx + delta::values_per_mini_block, _num_values);
+      _values_in_buffer = max(_values_in_buffer - delta::block_size, 0U);
+    }
+    __syncthreads();
+
+    return _dst;
+  }
+};
+
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 78873d5e8ca..1e4f061d2e0 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "delta_enc.cuh"
 #include "parquet_gpu.cuh"
 
 #include <io/utilities/block_utils.cuh>
@@ -21,6 +22,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/utilities/assert.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -41,13 +43,19 @@
 #include <thrust/scatter.h>
 #include <thrust/tuple.h>
 
+#include <bitset>
+
 namespace cudf::io::parquet::detail {
 
 namespace {
 
 using ::cudf::detail::device_2dspan;
 
-constexpr uint32_t rle_buffer_size = (1 << 9);
+constexpr int encode_block_size = 128;
+constexpr int rle_buffer_size   = 2 * encode_block_size;
+constexpr int num_encode_warps  = encode_block_size / cudf::detail::warp_size;
+
+constexpr int rolling_idx(int pos) { return rolling_index<rle_buffer_size>(pos); }
 
 // do not truncate statistics
 constexpr int32_t NO_TRUNC_STATS = 0;
@@ -69,6 +77,7 @@ struct frag_init_state_s {
   PageFragment frag;
 };
 
+template <int rle_buf_size>
 struct page_enc_state_s {
   uint8_t* cur;          //!< current output ptr
   uint8_t* rle_out;      //!< current RLE write ptr
@@ -81,14 +90,15 @@ struct page_enc_state_s {
   uint32_t rle_rpt_count;
   uint32_t page_start_val;
   uint32_t chunk_start_val;
-  volatile uint32_t rpt_map[4];
-  volatile uint32_t scratch_red[32];
+  volatile uint32_t rpt_map[num_encode_warps];
   EncPage page;
   EncColumnChunk ck;
   parquet_column_device_view col;
-  uint32_t vals[rle_buffer_size];
+  uint32_t vals[rle_buf_size];
 };
 
+using rle_page_enc_state_s = page_enc_state_s<rle_buffer_size>;
+
 /**
  * @brief Returns the size of the type in the Parquet file.
  */
@@ -205,6 +215,12 @@ void __device__ calculate_frag_size(frag_init_state_s* const s, int t)
   }
 }
 
+/**
+ * @brief Determine the correct page encoding for the given page parameters.
+ *
+ * This is only used by the plain and dictionary encoders. Delta encoders will set the page
+ * encoding directly.
+ */
 Encoding __device__ determine_encoding(PageType page_type,
                                        Type physical_type,
                                        bool use_dictionary,
@@ -216,7 +232,6 @@ Encoding __device__ determine_encoding(PageType page_type,
   switch (page_type) {
     case PageType::DATA_PAGE: return use_dictionary ? Encoding::PLAIN_DICTIONARY : Encoding::PLAIN;
     case PageType::DATA_PAGE_V2:
-      // TODO need to work in delta encodings here when they're added
       return physical_type == BOOLEAN ? Encoding::RLE
              : use_dictionary         ? Encoding::RLE_DICTIONARY
                                       : Encoding::PLAIN;
@@ -236,6 +251,50 @@ struct BitwiseOr {
   }
 };
 
+// I is the column type from the input table
+template <typename I>
+__device__ uint8_t const* delta_encode(page_enc_state_s<0>* s,
+                                       uint32_t valid_count,
+                                       uint64_t* buffer,
+                                       void* temp_space)
+{
+  using output_type = std::conditional_t<std::is_signed_v<I>, zigzag128_t, uleb128_t>;
+  __shared__ delta_binary_packer<output_type> packer;
+
+  auto const t = threadIdx.x;
+  if (t == 0) {
+    packer.init(s->cur, valid_count, reinterpret_cast<output_type*>(buffer), temp_space);
+  }
+  __syncthreads();
+
+  // TODO(ets): in the plain encoder the scaling is a little different for INT32 than INT64.
+  // might need to modify this if there's a big performance hit in the 32-bit case.
+  int32_t const scale = s->col.ts_scale == 0 ? 1 : s->col.ts_scale;
+  for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) {
+    uint32_t const nvals = min(s->page.num_leaf_values - cur_val_idx, delta::block_size);
+
+    size_type const val_idx_in_block = cur_val_idx + t;
+    size_type const val_idx          = s->page_start_val + val_idx_in_block;
+
+    bool const is_valid =
+      (val_idx < s->col.leaf_column->size() && val_idx_in_block < s->page.num_leaf_values)
+        ? s->col.leaf_column->is_valid(val_idx)
+        : false;
+
+    cur_val_idx += nvals;
+
+    output_type v = s->col.leaf_column->element<I>(val_idx);
+    if (scale < 0) {
+      v /= -scale;
+    } else {
+      v *= scale;
+    }
+    packer.add_value(v, is_valid);
+  }
+
+  return packer.flush();
+}
+
 }  // anonymous namespace
 
 // blockDim {512,1,1}
@@ -323,6 +382,29 @@ __global__ void __launch_bounds__(128)
   }
 }
 
+__device__ size_t delta_data_len(Type physical_type, cudf::type_id type_id, uint32_t num_values)
+{
+  auto const dtype_len_out = physical_type_len(physical_type, type_id);
+  auto const dtype_len     = [&]() -> uint32_t {
+    if (physical_type == INT32) { return int32_logical_len(type_id); }
+    if (physical_type == INT96) { return sizeof(int64_t); }
+    return dtype_len_out;
+  }();
+
+  auto const vals_per_block = delta::block_size;
+  size_t const num_blocks   = util::div_rounding_up_unsafe(num_values, vals_per_block);
+  // need max dtype_len + 1 bytes for min_delta
+  // one byte per mini block for the bitwidth
+  // and block_size * dtype_len bytes for the actual encoded data
+  auto const block_size = dtype_len + 1 + delta::num_mini_blocks + vals_per_block * dtype_len;
+
+  // delta header is 2 bytes for the block_size, 1 byte for number of mini-blocks,
+  // max 5 bytes for number of values, and max dtype_len + 1 for first value.
+  auto const header_size = 2 + 1 + 5 + dtype_len + 1;
+
+  return header_size + num_blocks * block_size;
+}
+
 // blockDim {128,1,1}
 __global__ void __launch_bounds__(128)
   gpuInitPages(device_2dspan<EncColumnChunk> chunks,
@@ -354,6 +436,14 @@ __global__ void __launch_bounds__(128)
     page_g = {};
   }
   __syncthreads();
+
+  // if writing delta encoded values, we're going to need to know the data length to get a guess
+  // at the worst case number of bytes needed to encode.
+  auto const physical_type = col_g.physical_type;
+  auto const type_id       = col_g.leaf_column->type().id();
+  auto const is_use_delta =
+    write_v2_headers && !ck_g.use_dictionary && (physical_type == INT32 || physical_type == INT64);
+
   if (t < 32) {
     uint32_t fragments_in_chunk  = 0;
     uint32_t rows_in_page        = 0;
@@ -403,9 +493,12 @@ __global__ void __launch_bounds__(128)
       }
       __syncwarp();
       if (t == 0) {
-        if (not pages.empty()) pages[ck_g.first_page] = page_g;
-        if (not page_sizes.empty()) page_sizes[ck_g.first_page] = page_g.max_data_size;
-        if (page_grstats) page_grstats[ck_g.first_page] = pagestats_g;
+        if (not pages.empty()) {
+          page_g.kernel_mask     = encode_kernel_mask::PLAIN;
+          pages[ck_g.first_page] = page_g;
+        }
+        if (not page_sizes.empty()) { page_sizes[ck_g.first_page] = page_g.max_data_size; }
+        if (page_grstats) { page_grstats[ck_g.first_page] = pagestats_g; }
       }
       num_pages = 1;
     }
@@ -505,7 +598,12 @@ __global__ void __launch_bounds__(128)
           page_g.num_values         = values_in_page;
           auto const def_level_size = max_RLE_page_size(col_g.num_def_level_bits(), values_in_page);
           auto const rep_level_size = max_RLE_page_size(col_g.num_rep_level_bits(), values_in_page);
-          auto const max_data_size  = page_size + def_level_size + rep_level_size + rle_pad;
+          // get a different bound if using delta encoding
+          if (is_use_delta) {
+            page_size =
+              max(page_size, delta_data_len(physical_type, type_id, page_g.num_leaf_values));
+          }
+          auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad;
           // page size must fit in 32-bit signed integer
           if (max_data_size > std::numeric_limits<int32_t>::max()) {
             CUDF_UNREACHABLE("page size exceeds maximum for i32");
@@ -525,7 +623,16 @@ __global__ void __launch_bounds__(128)
         }
         __syncwarp();
         if (t == 0) {
-          if (not pages.empty()) { pages[ck_g.first_page + num_pages] = page_g; }
+          if (not pages.empty()) {
+            if (is_use_delta) {
+              page_g.kernel_mask = encode_kernel_mask::DELTA_BINARY;
+            } else if (ck_g.use_dictionary || physical_type == BOOLEAN) {
+              page_g.kernel_mask = encode_kernel_mask::DICTIONARY;
+            } else {
+              page_g.kernel_mask = encode_kernel_mask::PLAIN;
+            }
+            pages[ck_g.first_page + num_pages] = page_g;
+          }
           if (not page_sizes.empty()) {
             page_sizes[ck_g.first_page + num_pages] = page_g.max_data_size;
           }
@@ -789,8 +896,12 @@ inline __device__ void PackLiterals(
  * @param[in] t thread id (0..127)
  */
 static __device__ void RleEncode(
-  page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t)
+  rle_page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t)
 {
+  using cudf::detail::warp_size;
+  auto const lane_id = t % warp_size;
+  auto const warp_id = t / warp_size;
+
   uint32_t rle_pos = s->rle_pos;
   uint32_t rle_run = s->rle_run;
 
@@ -798,20 +909,20 @@ static __device__ void RleEncode(
     uint32_t pos = rle_pos + t;
     if (rle_run > 0 && !(rle_run & 1)) {
       // Currently in a long repeat run
-      uint32_t mask = ballot(pos < numvals && s->vals[pos & (rle_buffer_size - 1)] == s->run_val);
+      uint32_t mask = ballot(pos < numvals && s->vals[rolling_idx(pos)] == s->run_val);
       uint32_t rle_rpt_count, max_rpt_count;
-      if (!(t & 0x1f)) { s->rpt_map[t >> 5] = mask; }
+      if (lane_id == 0) { s->rpt_map[warp_id] = mask; }
       __syncthreads();
-      if (t < 32) {
+      if (t < warp_size) {
         uint32_t c32 = ballot(t >= 4 || s->rpt_map[t] != 0xffff'ffffu);
-        if (!t) {
+        if (t == 0) {
           uint32_t last_idx = __ffs(c32) - 1;
           s->rle_rpt_count =
-            last_idx * 32 + ((last_idx < 4) ? __ffs(~s->rpt_map[last_idx]) - 1 : 0);
+            last_idx * warp_size + ((last_idx < 4) ? __ffs(~s->rpt_map[last_idx]) - 1 : 0);
         }
       }
       __syncthreads();
-      max_rpt_count = min(numvals - rle_pos, 128);
+      max_rpt_count = min(numvals - rle_pos, encode_block_size);
       rle_rpt_count = s->rle_rpt_count;
       rle_run += rle_rpt_count << 1;
       rle_pos += rle_rpt_count;
@@ -828,17 +939,17 @@ static __device__ void RleEncode(
       }
     } else {
       // New run or in a literal run
-      uint32_t v0      = s->vals[pos & (rle_buffer_size - 1)];
-      uint32_t v1      = s->vals[(pos + 1) & (rle_buffer_size - 1)];
+      uint32_t v0      = s->vals[rolling_idx(pos)];
+      uint32_t v1      = s->vals[rolling_idx(pos + 1)];
       uint32_t mask    = ballot(pos + 1 < numvals && v0 == v1);
-      uint32_t maxvals = min(numvals - rle_pos, 128);
+      uint32_t maxvals = min(numvals - rle_pos, encode_block_size);
       uint32_t rle_lit_count, rle_rpt_count;
-      if (!(t & 0x1f)) { s->rpt_map[t >> 5] = mask; }
+      if (lane_id == 0) { s->rpt_map[warp_id] = mask; }
       __syncthreads();
-      if (t < 32) {
+      if (t < warp_size) {
         // Repeat run can only start on a multiple of 8 values
-        uint32_t idx8        = (t * 8) >> 5;
-        uint32_t pos8        = (t * 8) & 0x1f;
+        uint32_t idx8        = (t * 8) / warp_size;
+        uint32_t pos8        = (t * 8) % warp_size;
         uint32_t m0          = (idx8 < 4) ? s->rpt_map[idx8] : 0;
         uint32_t m1          = (idx8 < 3) ? s->rpt_map[idx8 + 1] : 0;
         uint32_t needed_mask = kRleRunMask[nbits - 1];
@@ -847,8 +958,8 @@ static __device__ void RleEncode(
           uint32_t rle_run_start = (mask != 0) ? min((__ffs(mask) - 1) * 8, maxvals) : maxvals;
           uint32_t rpt_len       = 0;
           if (rle_run_start < maxvals) {
-            uint32_t idx_cur = rle_run_start >> 5;
-            uint32_t idx_ofs = rle_run_start & 0x1f;
+            uint32_t idx_cur = rle_run_start / warp_size;
+            uint32_t idx_ofs = rle_run_start % warp_size;
             while (idx_cur < 4) {
               m0   = (idx_cur < 4) ? s->rpt_map[idx_cur] : 0;
               m1   = (idx_cur < 3) ? s->rpt_map[idx_cur + 1] : 0;
@@ -857,7 +968,7 @@ static __device__ void RleEncode(
                 rpt_len += __ffs(mask) - 1;
                 break;
               }
-              rpt_len += 32;
+              rpt_len += warp_size;
               idx_cur++;
             }
           }
@@ -928,7 +1039,7 @@ static __device__ void RleEncode(
  * @param[in] flush nonzero if last batch in block
  * @param[in] t thread id (0..127)
  */
-static __device__ void PlainBoolEncode(page_enc_state_s* s,
+static __device__ void PlainBoolEncode(rle_page_enc_state_s* s,
                                        uint32_t numvals,
                                        uint32_t flush,
                                        uint32_t t)
@@ -938,7 +1049,7 @@ static __device__ void PlainBoolEncode(page_enc_state_s* s,
 
   while (rle_pos < numvals) {
     uint32_t pos    = rle_pos + t;
-    uint32_t v      = (pos < numvals) ? s->vals[pos & (rle_buffer_size - 1)] : 0;
+    uint32_t v      = (pos < numvals) ? s->vals[rolling_idx(pos)] : 0;
     uint32_t n      = min(numvals - rle_pos, 128);
     uint32_t nbytes = (n + ((flush) ? 7 : 0)) >> 3;
     if (!nbytes) { break; }
@@ -992,28 +1103,22 @@ __device__ auto julian_days_with_time(int64_t v)
   return std::make_pair(dur_time_of_day_nanos, julian_days);
 }
 
+// this has been split out into its own kernel because of the amount of shared memory required
+// for the state buffer. encode kernels that don't use the RLE buffer can get started while
+// the level data is encoded.
 // blockDim(128, 1, 1)
 template <int block_size>
-__global__ void __launch_bounds__(128, 8)
-  gpuEncodePages(device_span<EncPage> pages,
-                 device_span<device_span<uint8_t const>> comp_in,
-                 device_span<device_span<uint8_t>> comp_out,
-                 device_span<compression_result> comp_results,
-                 bool write_v2_headers)
+__global__ void __launch_bounds__(block_size, 8) gpuEncodePageLevels(device_span<EncPage> pages,
+                                                                     bool write_v2_headers,
+                                                                     encode_kernel_mask kernel_mask)
 {
-  __shared__ __align__(8) page_enc_state_s state_g;
-  using block_reduce = cub::BlockReduce<uint32_t, block_size>;
-  using block_scan   = cub::BlockScan<uint32_t, block_size>;
-  __shared__ union {
-    typename block_reduce::TempStorage reduce_storage;
-    typename block_scan::TempStorage scan_storage;
-  } temp_storage;
+  __shared__ __align__(8) rle_page_enc_state_s state_g;
 
-  page_enc_state_s* const s = &state_g;
-  auto const t              = threadIdx.x;
+  auto* const s    = &state_g;
+  uint32_t const t = threadIdx.x;
 
   if (t == 0) {
-    state_g = page_enc_state_s{};
+    state_g = rle_page_enc_state_s{};
     s->page = pages[blockIdx.x];
     s->ck   = *s->page.chunk;
     s->col  = *s->ck.col_desc;
@@ -1026,6 +1131,8 @@ __global__ void __launch_bounds__(128, 8)
   }
   __syncthreads();
 
+  if (BitAnd(s->page.kernel_mask, kernel_mask) == 0) { return; }
+
   auto const is_v2 = s->page.page_type == PageType::DATA_PAGE_V2;
 
   // Encode Repetition and Definition levels
@@ -1078,23 +1185,24 @@ __global__ void __launch_bounds__(128, 8)
           } while (is_col_struct);
           return def;
         }();
-        s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = def_lvl;
+        s->vals[rolling_idx(rle_numvals + t)] = def_lvl;
         __syncthreads();
         rle_numvals += nrows;
         RleEncode(s, rle_numvals, def_lvl_bits, (rle_numvals == s->page.num_rows), t);
         __syncthreads();
       }
       if (t < 32) {
-        uint8_t* const cur       = s->cur;
-        uint8_t* const rle_out   = s->rle_out;
-        uint32_t const rle_bytes = static_cast<uint32_t>(rle_out - cur) - (is_v2 ? 0 : 4);
-        if (is_v2 && t == 0) {
+        uint8_t* const cur     = s->cur;
+        uint8_t* const rle_out = s->rle_out;
+        // V2 does not write the RLE length field
+        uint32_t const rle_bytes =
+          static_cast<uint32_t>(rle_out - cur) - (is_v2 ? 0 : RLE_LENGTH_FIELD_LEN);
+        if (not is_v2 && t < RLE_LENGTH_FIELD_LEN) { cur[t] = rle_bytes >> (t * 8); }
+        __syncwarp();
+        if (t == 0) {
+          s->cur                = rle_out;
           s->page.def_lvl_bytes = rle_bytes;
-        } else if (not is_v2 && t < 4) {
-          cur[t] = rle_bytes >> (t * 8);
         }
-        __syncwarp();
-        if (t == 0) { s->cur = rle_out; }
       }
     }
   } else if (s->page.page_type != PageType::DICTIONARY_PAGE &&
@@ -1121,29 +1229,121 @@ __global__ void __launch_bounds__(128, 8)
         uint32_t idx         = page_first_val_idx + rle_numvals + t;
         uint32_t lvl_val =
           (rle_numvals + t < s->page.num_values && idx < col_last_val_idx) ? lvl_val_data[idx] : 0;
-        s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = lvl_val;
+        s->vals[rolling_idx(rle_numvals + t)] = lvl_val;
         __syncthreads();
         rle_numvals += nvals;
         RleEncode(s, rle_numvals, nbits, (rle_numvals == s->page.num_values), t);
         __syncthreads();
       }
       if (t < 32) {
-        uint8_t* const cur       = s->cur;
-        uint8_t* const rle_out   = s->rle_out;
-        uint32_t const rle_bytes = static_cast<uint32_t>(rle_out - cur) - (is_v2 ? 0 : 4);
-        if (is_v2 && t == 0) {
+        uint8_t* const cur     = s->cur;
+        uint8_t* const rle_out = s->rle_out;
+        // V2 does not write the RLE length field
+        uint32_t const rle_bytes =
+          static_cast<uint32_t>(rle_out - cur) - (is_v2 ? 0 : RLE_LENGTH_FIELD_LEN);
+        if (not is_v2 && t < RLE_LENGTH_FIELD_LEN) { cur[t] = rle_bytes >> (t * 8); }
+        __syncwarp();
+        if (t == 0) {
+          s->cur    = rle_out;
           lvl_bytes = rle_bytes;
-        } else if (not is_v2 && t < 4) {
-          cur[t] = rle_bytes >> (t * 8);
         }
-        __syncwarp();
-        if (t == 0) { s->cur = rle_out; }
       }
     };
     encode_levels(s->col.rep_values, s->col.num_rep_level_bits(), s->page.rep_lvl_bytes);
     __syncthreads();
     encode_levels(s->col.def_values, s->col.num_def_level_bits(), s->page.def_lvl_bytes);
   }
+
+  if (t == 0) { pages[blockIdx.x] = s->page; }
+}
+
+template <int block_size, typename state_buf>
+__device__ void finish_page_encode(state_buf* s,
+                                   uint32_t valid_count,
+                                   uint8_t const* end_ptr,
+                                   device_span<EncPage> pages,
+                                   device_span<device_span<uint8_t const>> comp_in,
+                                   device_span<device_span<uint8_t>> comp_out,
+                                   device_span<compression_result> comp_results,
+                                   bool write_v2_headers)
+{
+  auto const t = threadIdx.x;
+
+  // V2 does not compress rep and def level data
+  size_t const skip_comp_size =
+    write_v2_headers ? s->page.def_lvl_bytes + s->page.rep_lvl_bytes : 0;
+
+  if (t == 0) {
+    // only need num_nulls for v2 data page headers
+    if (write_v2_headers) { s->page.num_nulls = s->page.num_values - valid_count; }
+    uint8_t const* const base   = s->page.page_data + s->page.max_hdr_size;
+    auto const actual_data_size = static_cast<uint32_t>(end_ptr - base);
+    if (actual_data_size > s->page.max_data_size) {
+      CUDF_UNREACHABLE("detected possible page data corruption");
+    }
+    s->page.max_data_size = actual_data_size;
+    if (not comp_in.empty()) {
+      comp_in[blockIdx.x]  = {base + skip_comp_size, actual_data_size - skip_comp_size};
+      comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size,
+                              0};  // size is unused
+    }
+    pages[blockIdx.x] = s->page;
+    if (not comp_results.empty()) {
+      comp_results[blockIdx.x]   = {0, compression_status::FAILURE};
+      pages[blockIdx.x].comp_res = &comp_results[blockIdx.x];
+    }
+  }
+
+  // copy uncompressed bytes over
+  if (skip_comp_size != 0 && not comp_in.empty()) {
+    uint8_t* const src = s->page.page_data + s->page.max_hdr_size;
+    uint8_t* const dst = s->page.compressed_data + s->page.max_hdr_size;
+    for (int i = t; i < skip_comp_size; i += block_size) {
+      dst[i] = src[i];
+    }
+  }
+}
+
+// PLAIN page data encoder
+// blockDim(128, 1, 1)
+template <int block_size>
+__global__ void __launch_bounds__(block_size, 8)
+  gpuEncodePages(device_span<EncPage> pages,
+                 device_span<device_span<uint8_t const>> comp_in,
+                 device_span<device_span<uint8_t>> comp_out,
+                 device_span<compression_result> comp_results,
+                 bool write_v2_headers)
+{
+  __shared__ __align__(8) page_enc_state_s<0> state_g;
+  using block_reduce = cub::BlockReduce<uint32_t, block_size>;
+  using block_scan   = cub::BlockScan<uint32_t, block_size>;
+  __shared__ union {
+    typename block_reduce::TempStorage reduce_storage;
+    typename block_scan::TempStorage scan_storage;
+  } temp_storage;
+
+  auto* const s = &state_g;
+  uint32_t t    = threadIdx.x;
+
+  if (t == 0) {
+    state_g        = page_enc_state_s<0>{};
+    s->page        = pages[blockIdx.x];
+    s->ck          = *s->page.chunk;
+    s->col         = *s->ck.col_desc;
+    s->rle_len_pos = nullptr;
+    // get s->cur back to where it was at the end of encoding the rep and def level data
+    s->cur =
+      s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes;
+    // if V1 data page, need space for the RLE length fields
+    if (s->page.page_type == PageType::DATA_PAGE) {
+      if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; }
+      if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; }
+    }
+  }
+  __syncthreads();
+
+  if (BitAnd(s->page.kernel_mask, encode_kernel_mask::PLAIN) == 0) { return; }
+
   // Encode data values
   __syncthreads();
   auto const physical_type = s->col.physical_type;
@@ -1155,10 +1355,6 @@ __global__ void __launch_bounds__(128, 8)
     return dtype_len_out;
   }();
 
-  auto const dict_bits = (physical_type == BOOLEAN) ? 1
-                         : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE)
-                           ? s->ck.dict_rle_bits
-                           : -1;
   if (t == 0) {
     uint8_t* dst   = s->cur;
     s->rle_run     = 0;
@@ -1167,219 +1363,315 @@ __global__ void __launch_bounds__(128, 8)
     s->rle_out     = dst;
     s->page.encoding =
       determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers);
-    if (dict_bits >= 0 && physical_type != BOOLEAN) {
-      dst[0]     = dict_bits;
-      s->rle_out = dst + 1;
-    } else if (is_v2 && physical_type == BOOLEAN) {
-      // save space for RLE length. we don't know the total length yet.
-      s->rle_out     = dst + RLE_LENGTH_FIELD_LEN;
-      s->rle_len_pos = dst;
-    }
     s->page_start_val  = row_to_value_idx(s->page.start_row, s->col);
     s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col);
   }
   __syncthreads();
+
   uint32_t num_valid = 0;
   for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) {
-    uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, 128);
+    uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, block_size);
     uint32_t len, pos;
 
     auto [is_valid, val_idx] = [&]() {
       uint32_t val_idx;
       uint32_t is_valid;
 
-      size_type val_idx_in_block = cur_val_idx + t;
+      size_type const val_idx_in_block = cur_val_idx + t;
       if (s->page.page_type == PageType::DICTIONARY_PAGE) {
         val_idx  = val_idx_in_block;
         is_valid = (val_idx < s->page.num_leaf_values);
         if (is_valid) { val_idx = s->ck.dict_data[val_idx]; }
       } else {
-        size_type val_idx_in_leaf_col = s->page_start_val + val_idx_in_block;
+        size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block;
 
         is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() &&
                     val_idx_in_block < s->page.num_leaf_values)
                      ? s->col.leaf_column->is_valid(val_idx_in_leaf_col)
                      : 0;
-        val_idx =
-          (s->ck.use_dictionary) ? val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col;
+        val_idx  = val_idx_in_leaf_col;
       }
       return std::make_tuple(is_valid, val_idx);
     }();
 
-    if (is_valid) num_valid++;
-
+    if (is_valid) { num_valid++; }
     cur_val_idx += nvals;
-    if (dict_bits >= 0) {
-      // Dictionary encoding
-      if (dict_bits > 0) {
-        uint32_t rle_numvals;
-        uint32_t rle_numvals_in_block;
-        block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block);
-        rle_numvals = s->rle_numvals;
-        if (is_valid) {
-          uint32_t v;
-          if (physical_type == BOOLEAN) {
-            v = s->col.leaf_column->element<uint8_t>(val_idx);
-          } else {
-            v = s->ck.dict_index[val_idx];
-          }
-          s->vals[(rle_numvals + pos) & (rle_buffer_size - 1)] = v;
-        }
-        rle_numvals += rle_numvals_in_block;
-        __syncthreads();
-        if (!is_v2 && physical_type == BOOLEAN) {
-          PlainBoolEncode(s, rle_numvals, (cur_val_idx == s->page.num_leaf_values), t);
-        } else {
-          RleEncode(s, rle_numvals, dict_bits, (cur_val_idx == s->page.num_leaf_values), t);
+
+    // Non-dictionary encoding
+    uint8_t* dst = s->cur;
+
+    if (is_valid) {
+      len = dtype_len_out;
+      if (physical_type == BYTE_ARRAY) {
+        if (type_id == type_id::STRING) {
+          len += s->col.leaf_column->element<string_view>(val_idx).size_bytes();
+        } else if (s->col.output_as_byte_array && type_id == type_id::LIST) {
+          len +=
+            get_element<statistics::byte_array_view>(*s->col.leaf_column, val_idx).size_bytes();
         }
-        __syncthreads();
       }
-      if (t == 0) { s->cur = s->rle_out; }
-      __syncthreads();
     } else {
-      // Non-dictionary encoding
-      uint8_t* dst = s->cur;
-
-      if (is_valid) {
-        len = dtype_len_out;
-        if (physical_type == BYTE_ARRAY) {
-          if (type_id == type_id::STRING) {
-            len += s->col.leaf_column->element<string_view>(val_idx).size_bytes();
-          } else if (s->col.output_as_byte_array && type_id == type_id::LIST) {
-            len +=
-              get_element<statistics::byte_array_view>(*s->col.leaf_column, val_idx).size_bytes();
+      len = 0;
+    }
+    uint32_t total_len = 0;
+    block_scan(temp_storage.scan_storage).ExclusiveSum(len, pos, total_len);
+    __syncthreads();
+    if (t == 0) { s->cur = dst + total_len; }
+    if (is_valid) {
+      switch (physical_type) {
+        case INT32: [[fallthrough]];
+        case FLOAT: {
+          auto const v = [dtype_len = dtype_len_in,
+                          idx       = val_idx,
+                          col       = s->col.leaf_column,
+                          scale     = s->col.ts_scale == 0 ? 1 : s->col.ts_scale]() -> int32_t {
+            switch (dtype_len) {
+              case 8: return col->element<int64_t>(idx) * scale;
+              case 4: return col->element<int32_t>(idx) * scale;
+              case 2: return col->element<int16_t>(idx) * scale;
+              default: return col->element<int8_t>(idx) * scale;
+            }
+          }();
+
+          dst[pos + 0] = v;
+          dst[pos + 1] = v >> 8;
+          dst[pos + 2] = v >> 16;
+          dst[pos + 3] = v >> 24;
+        } break;
+        case INT64: {
+          int64_t v        = s->col.leaf_column->element<int64_t>(val_idx);
+          int32_t ts_scale = s->col.ts_scale;
+          if (ts_scale != 0) {
+            if (ts_scale < 0) {
+              v /= -ts_scale;
+            } else {
+              v *= ts_scale;
+            }
+          }
+          dst[pos + 0] = v;
+          dst[pos + 1] = v >> 8;
+          dst[pos + 2] = v >> 16;
+          dst[pos + 3] = v >> 24;
+          dst[pos + 4] = v >> 32;
+          dst[pos + 5] = v >> 40;
+          dst[pos + 6] = v >> 48;
+          dst[pos + 7] = v >> 56;
+        } break;
+        case INT96: {
+          int64_t v        = s->col.leaf_column->element<int64_t>(val_idx);
+          int32_t ts_scale = s->col.ts_scale;
+          if (ts_scale != 0) {
+            if (ts_scale < 0) {
+              v /= -ts_scale;
+            } else {
+              v *= ts_scale;
+            }
           }
-        }
-      } else {
-        len = 0;
-      }
-      uint32_t total_len = 0;
-      block_scan(temp_storage.scan_storage).ExclusiveSum(len, pos, total_len);
-      __syncthreads();
-      if (t == 0) { s->cur = dst + total_len; }
-      if (is_valid) {
-        switch (physical_type) {
-          case INT32: [[fallthrough]];
-          case FLOAT: {
-            auto const v = [dtype_len = dtype_len_in,
-                            idx       = val_idx,
-                            col       = s->col.leaf_column,
-                            scale     = s->col.ts_scale == 0 ? 1 : s->col.ts_scale]() -> int32_t {
-              switch (dtype_len) {
-                case 8: return col->element<int64_t>(idx) * scale;
-                case 4: return col->element<int32_t>(idx) * scale;
-                case 2: return col->element<int16_t>(idx) * scale;
-                default: return col->element<int8_t>(idx) * scale;
-              }
-            }();
 
-            dst[pos + 0] = v;
-            dst[pos + 1] = v >> 8;
-            dst[pos + 2] = v >> 16;
-            dst[pos + 3] = v >> 24;
-          } break;
-          case INT64: {
-            int64_t v        = s->col.leaf_column->element<int64_t>(val_idx);
-            int32_t ts_scale = s->col.ts_scale;
-            if (ts_scale != 0) {
-              if (ts_scale < 0) {
-                v /= -ts_scale;
-              } else {
-                v *= ts_scale;
-              }
+          auto const [last_day_nanos, julian_days] = [&] {
+            using namespace cuda::std::chrono;
+            switch (s->col.leaf_column->type().id()) {
+              case type_id::TIMESTAMP_SECONDS:
+              case type_id::TIMESTAMP_MILLISECONDS: {
+                return julian_days_with_time<cuda::std::milli>(v);
+              } break;
+              case type_id::TIMESTAMP_MICROSECONDS:
+              case type_id::TIMESTAMP_NANOSECONDS: {
+                return julian_days_with_time<cuda::std::micro>(v);
+              } break;
             }
-            dst[pos + 0] = v;
-            dst[pos + 1] = v >> 8;
-            dst[pos + 2] = v >> 16;
-            dst[pos + 3] = v >> 24;
-            dst[pos + 4] = v >> 32;
-            dst[pos + 5] = v >> 40;
-            dst[pos + 6] = v >> 48;
-            dst[pos + 7] = v >> 56;
-          } break;
-          case INT96: {
-            int64_t v        = s->col.leaf_column->element<int64_t>(val_idx);
-            int32_t ts_scale = s->col.ts_scale;
-            if (ts_scale != 0) {
-              if (ts_scale < 0) {
-                v /= -ts_scale;
-              } else {
-                v *= ts_scale;
-              }
+            return julian_days_with_time<cuda::std::nano>(0);
+          }();
+
+          // the 12 bytes of fixed length data.
+          v             = last_day_nanos.count();
+          dst[pos + 0]  = v;
+          dst[pos + 1]  = v >> 8;
+          dst[pos + 2]  = v >> 16;
+          dst[pos + 3]  = v >> 24;
+          dst[pos + 4]  = v >> 32;
+          dst[pos + 5]  = v >> 40;
+          dst[pos + 6]  = v >> 48;
+          dst[pos + 7]  = v >> 56;
+          uint32_t w    = julian_days.count();
+          dst[pos + 8]  = w;
+          dst[pos + 9]  = w >> 8;
+          dst[pos + 10] = w >> 16;
+          dst[pos + 11] = w >> 24;
+        } break;
+
+        case DOUBLE: {
+          auto v = s->col.leaf_column->element<double>(val_idx);
+          memcpy(dst + pos, &v, 8);
+        } break;
+        case BYTE_ARRAY: {
+          auto const bytes = [](cudf::type_id const type_id,
+                                column_device_view const* leaf_column,
+                                uint32_t const val_idx) -> void const* {
+            switch (type_id) {
+              case type_id::STRING:
+                return reinterpret_cast<void const*>(
+                  leaf_column->element<string_view>(val_idx).data());
+              case type_id::LIST:
+                return reinterpret_cast<void const*>(
+                  get_element<statistics::byte_array_view>(*(leaf_column), val_idx).data());
+              default: CUDF_UNREACHABLE("invalid type id for byte array writing!");
             }
+          }(type_id, s->col.leaf_column, val_idx);
+          uint32_t v   = len - 4;  // string length
+          dst[pos + 0] = v;
+          dst[pos + 1] = v >> 8;
+          dst[pos + 2] = v >> 16;
+          dst[pos + 3] = v >> 24;
+          if (v != 0) memcpy(dst + pos + 4, bytes, v);
+        } break;
+        case FIXED_LEN_BYTE_ARRAY: {
+          if (type_id == type_id::DECIMAL128) {
+            // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian
+            auto const v = s->col.leaf_column->element<numeric::decimal128>(val_idx).value();
+            auto const v_char_ptr = reinterpret_cast<char const*>(&v);
+            thrust::copy(thrust::seq,
+                         thrust::make_reverse_iterator(v_char_ptr + sizeof(v)),
+                         thrust::make_reverse_iterator(v_char_ptr),
+                         dst + pos);
+          }
+        } break;
+      }
+    }
+    __syncthreads();
+  }
 
-            auto const [last_day_nanos, julian_days] = [&] {
-              using namespace cuda::std::chrono;
-              switch (s->col.leaf_column->type().id()) {
-                case type_id::TIMESTAMP_SECONDS:
-                case type_id::TIMESTAMP_MILLISECONDS: {
-                  return julian_days_with_time<cuda::std::milli>(v);
-                } break;
-                case type_id::TIMESTAMP_MICROSECONDS:
-                case type_id::TIMESTAMP_NANOSECONDS: {
-                  return julian_days_with_time<cuda::std::micro>(v);
-                } break;
-              }
-              return julian_days_with_time<cuda::std::nano>(0);
-            }();
-
-            // the 12 bytes of fixed length data.
-            v             = last_day_nanos.count();
-            dst[pos + 0]  = v;
-            dst[pos + 1]  = v >> 8;
-            dst[pos + 2]  = v >> 16;
-            dst[pos + 3]  = v >> 24;
-            dst[pos + 4]  = v >> 32;
-            dst[pos + 5]  = v >> 40;
-            dst[pos + 6]  = v >> 48;
-            dst[pos + 7]  = v >> 56;
-            uint32_t w    = julian_days.count();
-            dst[pos + 8]  = w;
-            dst[pos + 9]  = w >> 8;
-            dst[pos + 10] = w >> 16;
-            dst[pos + 11] = w >> 24;
-          } break;
+  uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid);
 
-          case DOUBLE: {
-            auto v = s->col.leaf_column->element<double>(val_idx);
-            memcpy(dst + pos, &v, 8);
-          } break;
-          case BYTE_ARRAY: {
-            auto const bytes = [](cudf::type_id const type_id,
-                                  column_device_view const* leaf_column,
-                                  uint32_t const val_idx) -> void const* {
-              switch (type_id) {
-                case type_id::STRING:
-                  return reinterpret_cast<void const*>(
-                    leaf_column->element<string_view>(val_idx).data());
-                case type_id::LIST:
-                  return reinterpret_cast<void const*>(
-                    get_element<statistics::byte_array_view>(*(leaf_column), val_idx).data());
-                default: CUDF_UNREACHABLE("invalid type id for byte array writing!");
-              }
-            }(type_id, s->col.leaf_column, val_idx);
-            uint32_t v   = len - 4;  // string length
-            dst[pos + 0] = v;
-            dst[pos + 1] = v >> 8;
-            dst[pos + 2] = v >> 16;
-            dst[pos + 3] = v >> 24;
-            if (v != 0) memcpy(dst + pos + 4, bytes, v);
-          } break;
-          case FIXED_LEN_BYTE_ARRAY: {
-            if (type_id == type_id::DECIMAL128) {
-              // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian
-              auto const v = s->col.leaf_column->element<numeric::decimal128>(val_idx).value();
-              auto const v_char_ptr = reinterpret_cast<char const*>(&v);
-              thrust::copy(thrust::seq,
-                           thrust::make_reverse_iterator(v_char_ptr + sizeof(v)),
-                           thrust::make_reverse_iterator(v_char_ptr),
-                           dst + pos);
-            }
-          } break;
+  finish_page_encode<block_size>(
+    s, valid_count, s->cur, pages, comp_in, comp_out, comp_results, write_v2_headers);
+}
+
+// DICTIONARY page data encoder
+// blockDim(128, 1, 1)
+template <int block_size>
+__global__ void __launch_bounds__(block_size, 8)
+  gpuEncodeDictPages(device_span<EncPage> pages,
+                     device_span<device_span<uint8_t const>> comp_in,
+                     device_span<device_span<uint8_t>> comp_out,
+                     device_span<compression_result> comp_results,
+                     bool write_v2_headers)
+{
+  __shared__ __align__(8) rle_page_enc_state_s state_g;
+  using block_reduce = cub::BlockReduce<uint32_t, block_size>;
+  using block_scan   = cub::BlockScan<uint32_t, block_size>;
+  __shared__ union {
+    typename block_reduce::TempStorage reduce_storage;
+    typename block_scan::TempStorage scan_storage;
+  } temp_storage;
+
+  auto* const s = &state_g;
+  uint32_t t    = threadIdx.x;
+
+  if (t == 0) {
+    state_g        = rle_page_enc_state_s{};
+    s->page        = pages[blockIdx.x];
+    s->ck          = *s->page.chunk;
+    s->col         = *s->ck.col_desc;
+    s->rle_len_pos = nullptr;
+    // get s->cur back to where it was at the end of encoding the rep and def level data
+    s->cur =
+      s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes;
+    // if V1 data page, need space for the RLE length fields
+    if (s->page.page_type == PageType::DATA_PAGE) {
+      if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; }
+      if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; }
+    }
+  }
+  __syncthreads();
+
+  if (BitAnd(s->page.kernel_mask, encode_kernel_mask::DICTIONARY) == 0) { return; }
+
+  // Encode data values
+  __syncthreads();
+  auto const physical_type = s->col.physical_type;
+  auto const type_id       = s->col.leaf_column->type().id();
+  auto const dtype_len_out = physical_type_len(physical_type, type_id);
+  auto const dtype_len_in  = [&]() -> uint32_t {
+    if (physical_type == INT32) { return int32_logical_len(type_id); }
+    if (physical_type == INT96) { return sizeof(int64_t); }
+    return dtype_len_out;
+  }();
+
+  // TODO assert dict_bits >= 0
+  auto const dict_bits = (physical_type == BOOLEAN) ? 1
+                         : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE)
+                           ? s->ck.dict_rle_bits
+                           : -1;
+  if (t == 0) {
+    uint8_t* dst   = s->cur;
+    s->rle_run     = 0;
+    s->rle_pos     = 0;
+    s->rle_numvals = 0;
+    s->rle_out     = dst;
+    s->page.encoding =
+      determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers);
+    if (dict_bits >= 0 && physical_type != BOOLEAN) {
+      dst[0]     = dict_bits;
+      s->rle_out = dst + 1;
+    } else if (write_v2_headers && physical_type == BOOLEAN) {
+      // save space for RLE length. we don't know the total length yet.
+      s->rle_out     = dst + RLE_LENGTH_FIELD_LEN;
+      s->rle_len_pos = dst;
+    }
+    s->page_start_val  = row_to_value_idx(s->page.start_row, s->col);
+    s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col);
+  }
+  __syncthreads();
+
+  uint32_t num_valid = 0;
+  for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) {
+    uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, block_size);
+
+    auto [is_valid, val_idx] = [&]() {
+      size_type const val_idx_in_block    = cur_val_idx + t;
+      size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block;
+
+      uint32_t const is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() &&
+                                 val_idx_in_block < s->page.num_leaf_values)
+                                  ? s->col.leaf_column->is_valid(val_idx_in_leaf_col)
+                                  : 0;
+      // need to test for use_dictionary because it might be boolean
+      uint32_t const val_idx =
+        (s->ck.use_dictionary) ? val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col;
+      return std::make_tuple(is_valid, val_idx);
+    }();
+
+    if (is_valid) { num_valid++; }
+    cur_val_idx += nvals;
+
+    // Dictionary encoding
+    if (dict_bits > 0) {
+      uint32_t rle_numvals;
+      uint32_t rle_numvals_in_block;
+      uint32_t pos;
+      block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block);
+      rle_numvals = s->rle_numvals;
+      if (is_valid) {
+        uint32_t v;
+        if (physical_type == BOOLEAN) {
+          v = s->col.leaf_column->element<uint8_t>(val_idx);
+        } else {
+          v = s->ck.dict_index[val_idx];
         }
+        s->vals[rolling_idx(rle_numvals + pos)] = v;
+      }
+      rle_numvals += rle_numvals_in_block;
+      __syncthreads();
+      if ((!write_v2_headers) && (physical_type == BOOLEAN)) {
+        PlainBoolEncode(s, rle_numvals, (cur_val_idx == s->page.num_leaf_values), t);
+      } else {
+        RleEncode(s, rle_numvals, dict_bits, (cur_val_idx == s->page.num_leaf_values), t);
       }
       __syncthreads();
     }
+    if (t == 0) { s->cur = s->rle_out; }
+    __syncthreads();
   }
 
   uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid);
@@ -1392,37 +1684,137 @@ __global__ void __launch_bounds__(128, 8)
     __syncwarp();
   }
 
-  // V2 does not compress rep and def level data
-  size_t const skip_comp_size = s->page.def_lvl_bytes + s->page.rep_lvl_bytes;
+  finish_page_encode<block_size>(
+    s, valid_count, s->cur, pages, comp_in, comp_out, comp_results, write_v2_headers);
+}
+
+// DELTA_BINARY_PACKED page data encoder
+// blockDim(128, 1, 1)
+template <int block_size>
+__global__ void __launch_bounds__(block_size, 8)
+  gpuEncodeDeltaBinaryPages(device_span<EncPage> pages,
+                            device_span<device_span<uint8_t const>> comp_in,
+                            device_span<device_span<uint8_t>> comp_out,
+                            device_span<compression_result> comp_results)
+{
+  // block of shared memory for value storage and bit packing
+  __shared__ uleb128_t delta_shared[delta::buffer_size + delta::block_size];
+  __shared__ __align__(8) page_enc_state_s<0> state_g;
+  using block_reduce = cub::BlockReduce<uint32_t, block_size>;
+  __shared__ union {
+    typename block_reduce::TempStorage reduce_storage;
+    typename delta::index_scan::TempStorage delta_index_tmp;
+    typename delta::block_reduce::TempStorage delta_reduce_tmp;
+    typename delta::warp_reduce::TempStorage delta_warp_red_tmp[delta::num_mini_blocks];
+  } temp_storage;
+
+  auto* const s = &state_g;
+  uint32_t t    = threadIdx.x;
 
   if (t == 0) {
-    s->page.num_nulls           = s->page.num_values - valid_count;
-    uint8_t* const base         = s->page.page_data + s->page.max_hdr_size;
-    auto const actual_data_size = static_cast<uint32_t>(s->cur - base);
-    if (actual_data_size > s->page.max_data_size) {
-      CUDF_UNREACHABLE("detected possible page data corruption");
-    }
-    s->page.max_data_size = actual_data_size;
-    if (not comp_in.empty()) {
-      comp_in[blockIdx.x]  = {base + skip_comp_size, actual_data_size - skip_comp_size};
-      comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size,
-                              0};  // size is unused
-    }
-    pages[blockIdx.x] = s->page;
-    if (not comp_results.empty()) {
-      comp_results[blockIdx.x]   = {0, compression_status::FAILURE};
-      pages[blockIdx.x].comp_res = &comp_results[blockIdx.x];
+    state_g        = page_enc_state_s<0>{};
+    s->page        = pages[blockIdx.x];
+    s->ck          = *s->page.chunk;
+    s->col         = *s->ck.col_desc;
+    s->rle_len_pos = nullptr;
+    // get s->cur back to where it was at the end of encoding the rep and def level data
+    s->cur =
+      s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes;
+  }
+  __syncthreads();
+
+  if (BitAnd(s->page.kernel_mask, encode_kernel_mask::DELTA_BINARY) == 0) { return; }
+
+  // Encode data values
+  __syncthreads();
+  auto const physical_type = s->col.physical_type;
+  auto const type_id       = s->col.leaf_column->type().id();
+  auto const dtype_len_out = physical_type_len(physical_type, type_id);
+  auto const dtype_len_in  = [&]() -> uint32_t {
+    if (physical_type == INT32) { return int32_logical_len(type_id); }
+    if (physical_type == INT96) { return sizeof(int64_t); }
+    return dtype_len_out;
+  }();
+
+  if (t == 0) {
+    uint8_t* dst       = s->cur;
+    s->rle_run         = 0;
+    s->rle_pos         = 0;
+    s->rle_numvals     = 0;
+    s->rle_out         = dst;
+    s->page.encoding   = Encoding::DELTA_BINARY_PACKED;
+    s->page_start_val  = row_to_value_idx(s->page.start_row, s->col);
+    s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col);
+  }
+  __syncthreads();
+
+  // need to know the number of valid values for the null values calculation and to size
+  // the delta binary encoder.
+  uint32_t valid_count = 0;
+  if (not s->col.leaf_column->nullable()) {
+    valid_count = s->page.num_leaf_values;
+  } else {
+    uint32_t num_valid = 0;
+    for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) {
+      uint32_t const nvals                = min(s->page.num_leaf_values - cur_val_idx, block_size);
+      size_type const val_idx_in_block    = cur_val_idx + t;
+      size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block;
+
+      if (val_idx_in_leaf_col < s->col.leaf_column->size() &&
+          val_idx_in_block < s->page.num_leaf_values &&
+          s->col.leaf_column->is_valid(val_idx_in_leaf_col)) {
+        num_valid++;
+      }
+      cur_val_idx += nvals;
     }
+    valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid);
   }
 
-  // copy over uncompressed data
-  if (skip_comp_size != 0 && not comp_in.empty()) {
-    uint8_t const* const src = s->page.page_data + s->page.max_hdr_size;
-    uint8_t* const dst       = s->page.compressed_data + s->page.max_hdr_size;
-    for (int i = t; i < skip_comp_size; i += block_size) {
-      dst[i] = src[i];
+  uint8_t const* delta_ptr = nullptr;  // this will be the end of delta block pointer
+
+  if (physical_type == INT32) {
+    switch (dtype_len_in) {
+      case 8: {
+        // only DURATIONS map to 8 bytes, so safe to just use signed here?
+        delta_ptr = delta_encode<int64_t>(s, valid_count, delta_shared, &temp_storage);
+        break;
+      }
+      case 4: {
+        if (type_id == type_id::UINT32) {
+          delta_ptr = delta_encode<uint32_t>(s, valid_count, delta_shared, &temp_storage);
+        } else {
+          delta_ptr = delta_encode<int32_t>(s, valid_count, delta_shared, &temp_storage);
+        }
+        break;
+      }
+      case 2: {
+        if (type_id == type_id::UINT16) {
+          delta_ptr = delta_encode<uint16_t>(s, valid_count, delta_shared, &temp_storage);
+        } else {
+          delta_ptr = delta_encode<int16_t>(s, valid_count, delta_shared, &temp_storage);
+        }
+        break;
+      }
+      case 1: {
+        if (type_id == type_id::UINT8) {
+          delta_ptr = delta_encode<uint8_t>(s, valid_count, delta_shared, &temp_storage);
+        } else {
+          delta_ptr = delta_encode<int8_t>(s, valid_count, delta_shared, &temp_storage);
+        }
+        break;
+      }
+      default: CUDF_UNREACHABLE("invalid dtype_len_in when encoding DELTA_BINARY_PACKED");
+    }
+  } else {
+    if (type_id == type_id::UINT64) {
+      delta_ptr = delta_encode<uint64_t>(s, valid_count, delta_shared, &temp_storage);
+    } else {
+      delta_ptr = delta_encode<int64_t>(s, valid_count, delta_shared, &temp_storage);
     }
   }
+
+  finish_page_encode<block_size>(
+    s, valid_count, delta_ptr, pages, comp_in, comp_out, comp_results, true);
 }
 
 constexpr int decide_compression_warps_in_block = 4;
@@ -1457,7 +1849,8 @@ __global__ void __launch_bounds__(decide_compression_block_size)
   for (auto page_id = lane_id; page_id < num_pages; page_id += cudf::detail::warp_size) {
     auto const& curr_page     = ck_g[warp_id].pages[page_id];
     auto const page_data_size = curr_page.max_data_size;
-    auto const lvl_bytes      = curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes;
+    auto const is_v2          = curr_page.page_type == PageType::DATA_PAGE_V2;
+    auto const lvl_bytes      = is_v2 ? curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes : 0;
     uncompressed_data_size += page_data_size;
     if (auto comp_res = curr_page.comp_res; comp_res != nullptr) {
       compressed_data_size += comp_res->bytes_written + lvl_bytes;
@@ -1920,7 +2313,8 @@ __global__ void __launch_bounds__(128)
     }
     uncompressed_page_size = page_g.max_data_size;
     if (ck_g.is_compressed) {
-      auto const lvl_bytes = page_g.def_lvl_bytes + page_g.rep_lvl_bytes;
+      auto const is_v2     = page_g.page_type == PageType::DATA_PAGE_V2;
+      auto const lvl_bytes = is_v2 ? page_g.def_lvl_bytes + page_g.rep_lvl_bytes : 0;
       hdr_start            = page_g.compressed_data;
       compressed_page_size =
         static_cast<uint32_t>(comp_results[blockIdx.x].bytes_written) + lvl_bytes;
@@ -2155,6 +2549,10 @@ constexpr __device__ void* align8(void* ptr)
   return static_cast<char*>(ptr) - algn;
 }
 
+struct mask_tform {
+  __device__ uint32_t operator()(EncPage const& p) { return static_cast<uint32_t>(p.kernel_mask); }
+};
+
 }  // namespace
 
 // blockDim(1, 1, 1)
@@ -2257,8 +2655,9 @@ void InitFragmentStatistics(device_span<statistics_group> groups,
                             rmm::cuda_stream_view stream)
 {
   int const num_fragments = fragments.size();
-  int const dim = util::div_rounding_up_safe(num_fragments, 128 / cudf::detail::warp_size);
-  gpuInitFragmentStats<<<dim, 128, 0, stream.value()>>>(groups, fragments);
+  int const dim =
+    util::div_rounding_up_safe(num_fragments, encode_block_size / cudf::detail::warp_size);
+  gpuInitFragmentStats<<<dim, encode_block_size, 0, stream.value()>>>(groups, fragments);
 }
 
 void InitEncoderPages(device_2dspan<EncColumnChunk> chunks,
@@ -2277,18 +2676,18 @@ void InitEncoderPages(device_2dspan<EncColumnChunk> chunks,
 {
   auto num_rowgroups = chunks.size().first;
   dim3 dim_grid(num_columns, num_rowgroups);  // 1 threadblock per rowgroup
-  gpuInitPages<<<dim_grid, 128, 0, stream.value()>>>(chunks,
-                                                     pages,
-                                                     page_sizes,
-                                                     comp_page_sizes,
-                                                     col_desc,
-                                                     page_grstats,
-                                                     chunk_grstats,
-                                                     num_columns,
-                                                     max_page_size_bytes,
-                                                     max_page_size_rows,
-                                                     page_align,
-                                                     write_v2_headers);
+  gpuInitPages<<<dim_grid, encode_block_size, 0, stream.value()>>>(chunks,
+                                                                   pages,
+                                                                   page_sizes,
+                                                                   comp_page_sizes,
+                                                                   col_desc,
+                                                                   page_grstats,
+                                                                   chunk_grstats,
+                                                                   num_columns,
+                                                                   max_page_size_bytes,
+                                                                   max_page_size_rows,
+                                                                   page_align,
+                                                                   write_v2_headers);
 }
 
 void EncodePages(device_span<EncPage> pages,
@@ -2299,10 +2698,43 @@ void EncodePages(device_span<EncPage> pages,
                  rmm::cuda_stream_view stream)
 {
   auto num_pages = pages.size();
+
+  // determine which kernels to invoke
+  auto mask_iter       = thrust::make_transform_iterator(pages.begin(), mask_tform{});
+  uint32_t kernel_mask = thrust::reduce(
+    rmm::exec_policy(stream), mask_iter, mask_iter + pages.size(), 0U, thrust::bit_or<uint32_t>{});
+
+  // get the number of streams we need from the pool
+  int nkernels = std::bitset<32>(kernel_mask).count();
+  auto streams = cudf::detail::fork_streams(stream, nkernels);
+
   // A page is part of one column. This is launching 1 block per page. 1 block will exclusively
   // deal with one datatype.
-  gpuEncodePages<128><<<num_pages, 128, 0, stream.value()>>>(
-    pages, comp_in, comp_out, comp_results, write_v2_headers);
+
+  int s_idx = 0;
+  if (BitAnd(kernel_mask, encode_kernel_mask::PLAIN) != 0) {
+    auto const strm = streams[s_idx++];
+    gpuEncodePageLevels<encode_block_size><<<num_pages, encode_block_size, 0, strm.value()>>>(
+      pages, write_v2_headers, encode_kernel_mask::PLAIN);
+    gpuEncodePages<encode_block_size><<<num_pages, encode_block_size, 0, strm.value()>>>(
+      pages, comp_in, comp_out, comp_results, write_v2_headers);
+  }
+  if (BitAnd(kernel_mask, encode_kernel_mask::DELTA_BINARY) != 0) {
+    auto const strm = streams[s_idx++];
+    gpuEncodePageLevels<encode_block_size><<<num_pages, encode_block_size, 0, strm.value()>>>(
+      pages, write_v2_headers, encode_kernel_mask::DELTA_BINARY);
+    gpuEncodeDeltaBinaryPages<encode_block_size>
+      <<<num_pages, encode_block_size, 0, strm.value()>>>(pages, comp_in, comp_out, comp_results);
+  }
+  if (BitAnd(kernel_mask, encode_kernel_mask::DICTIONARY) != 0) {
+    auto const strm = streams[s_idx++];
+    gpuEncodePageLevels<encode_block_size><<<num_pages, encode_block_size, 0, strm.value()>>>(
+      pages, write_v2_headers, encode_kernel_mask::DICTIONARY);
+    gpuEncodeDictPages<encode_block_size><<<num_pages, encode_block_size, 0, strm.value()>>>(
+      pages, comp_in, comp_out, comp_results, write_v2_headers);
+  }
+
+  cudf::detail::join_streams(streams, stream);
 }
 
 void DecideCompression(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream)
@@ -2320,7 +2752,7 @@ void EncodePageHeaders(device_span<EncPage> pages,
 {
   // TODO: single thread task. No need for 128 threads/block. Earlier it used to employ rest of the
   // threads to coop load structs
-  gpuEncodePageHeaders<<<pages.size(), 128, 0, stream.value()>>>(
+  gpuEncodePageHeaders<<<pages.size(), encode_block_size, 0, stream.value()>>>(
     pages, comp_results, page_stats, chunk_stats);
 }
 
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 6a93fec0c46..048f1a73a9c 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -88,6 +88,37 @@ struct input_column_info {
   auto nesting_depth() const { return nesting.size(); }
 };
 
+// The delta encodings use ULEB128 integers, but parquet only uses max 64 bits.
+using uleb128_t   = uint64_t;
+using zigzag128_t = int64_t;
+
+// this is in C++23
+#if !defined(__cpp_lib_is_scoped_enum)
+template <typename Enum, bool = std::is_enum_v<Enum>>
+struct is_scoped_enum {
+  static const bool value = not std::is_convertible_v<Enum, std::underlying_type_t<Enum>>;
+};
+
+template <typename Enum>
+struct is_scoped_enum<Enum, false> {
+  static const bool value = false;
+};
+#else
+using std::is_scoped_enum;
+#endif
+
+// helpers to do bit operations on scoped enums
+template <class T1,
+          class T2,
+          typename std::enable_if_t<(is_scoped_enum<T1>::value and std::is_same_v<T1, T2>) or
+                                    (is_scoped_enum<T1>::value and std::is_same_v<uint32_t, T2>) or
+                                    (is_scoped_enum<T2>::value and std::is_same_v<uint32_t, T1>)>* =
+            nullptr>
+constexpr uint32_t BitAnd(T1 a, T2 b)
+{
+  return static_cast<uint32_t>(a) & static_cast<uint32_t>(b);
+}
+
 /**
  * @brief Enums for the flags in the page header
  */
@@ -371,6 +402,17 @@ constexpr uint32_t encoding_to_mask(Encoding encoding)
   return 1 << static_cast<uint32_t>(encoding);
 }
 
+/**
+ * @brief Enum of mask bits for the EncPage kernel_mask
+ *
+ * Used to control which encode kernels to run.
+ */
+enum class encode_kernel_mask {
+  PLAIN        = (1 << 0),  // Run plain encoding kernel
+  DICTIONARY   = (1 << 1),  // Run dictionary encoding kernel
+  DELTA_BINARY = (1 << 2)   // Run DELTA_BINARY_PACKED encoding kernel
+};
+
 /**
  * @brief Struct describing an encoder column chunk
  */
@@ -429,10 +471,11 @@ struct EncPage {
   uint32_t num_leaf_values;  //!< Values in page. Different from num_rows in case of nested types
   uint32_t num_values;  //!< Number of def/rep level values in page. Includes null/empty elements in
                         //!< non-leaf levels
-  uint32_t def_lvl_bytes;        //!< Number of bytes of encoded definition level data (V2 only)
-  uint32_t rep_lvl_bytes;        //!< Number of bytes of encoded repetition level data (V2 only)
-  compression_result* comp_res;  //!< Ptr to compression result
-  uint32_t num_nulls;            //!< Number of null values (V2 only) (down here for alignment)
+  uint32_t def_lvl_bytes;          //!< Number of bytes of encoded definition level data (V2 only)
+  uint32_t rep_lvl_bytes;          //!< Number of bytes of encoded repetition level data (V2 only)
+  compression_result* comp_res;    //!< Ptr to compression result
+  uint32_t num_nulls;              //!< Number of null values (V2 only) (down here for alignment)
+  encode_kernel_mask kernel_mask;  //!< Mask used to control which encoding kernels to run
 };
 
 /**
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index fa85e3a4a1d..2a654bd7e8c 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -353,6 +353,9 @@ struct ParquetWriterSchemaTest : public ParquetWriterTest {
 template <typename T>
 struct ParquetReaderSourceTest : public ParquetReaderTest {};
 
+template <typename T>
+struct ParquetWriterDeltaTest : public ParquetWriterTest {};
+
 // Declare typed test cases
 // TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352
 using SupportedTypes = cudf::test::Types<int8_t, int16_t, int32_t, int64_t, bool, float, double>;
@@ -384,7 +387,6 @@ TYPED_TEST_SUITE(ParquetChunkedWriterNumericTypeTest, SupportedTypes);
 class ParquetSizedTest : public ::cudf::test::BaseFixtureWithParam<int> {};
 
 // test the allowed bit widths for dictionary encoding
-// values chosen to trigger 1, 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, and 24 bit dictionaries
 INSTANTIATE_TEST_SUITE_P(ParquetDictionaryTest,
                          ParquetSizedTest,
                          testing::Range(1, 25),
@@ -6698,7 +6700,7 @@ TEST_P(ParquetV2Test, CheckEncodings)
   // data should be PLAIN for v1, RLE for V2
   auto col0_data =
     cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 2 == 0; });
-  // data should be PLAIN for both
+  // data should be PLAIN for v1, DELTA_BINARY_PACKED for v2
   auto col1_data = random_values<int32_t>(num_rows);
   // data should be PLAIN_DICTIONARY for v1, PLAIN and RLE_DICTIONARY for v2
   auto col2_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; });
@@ -6733,10 +6735,10 @@ TEST_P(ParquetV2Test, CheckEncodings)
     // col0 should have RLE for rep/def and data
     EXPECT_TRUE(chunk0_enc.size() == 1);
     EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE));
-    // col1 should have RLE for rep/def and PLAIN for data
+    // col1 should have RLE for rep/def and DELTA_BINARY_PACKED for data
     EXPECT_TRUE(chunk1_enc.size() == 2);
     EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE));
-    EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN));
+    EXPECT_TRUE(contains(chunk1_enc, Encoding::DELTA_BINARY_PACKED));
     // col2 should have RLE for rep/def, PLAIN for dict, and RLE_DICTIONARY for data
     EXPECT_TRUE(chunk2_enc.size() == 3);
     EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE));
@@ -6758,6 +6760,104 @@ TEST_P(ParquetV2Test, CheckEncodings)
   }
 }
 
+// removing duration_D, duration_s, and timestamp_s as they don't appear to be supported properly.
+// see definition of UnsupportedChronoTypes above.
+using DeltaDecimalTypes = cudf::test::Types<numeric::decimal32, numeric::decimal64>;
+using DeltaBinaryTypes =
+  cudf::test::Concat<cudf::test::IntegralTypesNotBool, cudf::test::ChronoTypes, DeltaDecimalTypes>;
+using SupportedDeltaTestTypes =
+  cudf::test::RemoveIf<cudf::test::ContainedIn<UnsupportedChronoTypes>, DeltaBinaryTypes>;
+TYPED_TEST_SUITE(ParquetWriterDeltaTest, SupportedDeltaTestTypes);
+
+TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypes)
+{
+  using T   = TypeParam;
+  auto col0 = testdata::ascending<T>();
+  auto col1 = testdata::unordered<T>();
+
+  auto const expected = table_view{{col0, col1}};
+
+  auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPacked.parquet");
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+      .write_v2_headers(true)
+      .dictionary_policy(cudf::io::dictionary_policy::NEVER);
+  cudf::io::write_parquet(out_opts);
+
+  cudf::io::parquet_reader_options in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto result = cudf::io::read_parquet(in_opts);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+}
+
+TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypesSliced)
+{
+  using T                = TypeParam;
+  constexpr int num_rows = 4'000;
+  auto col0              = testdata::ascending<T>();
+  auto col1              = testdata::unordered<T>();
+
+  auto const expected = table_view{{col0, col1}};
+  auto expected_slice = cudf::slice(expected, {num_rows, 2 * num_rows});
+  ASSERT_EQ(expected_slice[0].num_rows(), num_rows);
+
+  auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedSliced.parquet");
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice)
+      .write_v2_headers(true)
+      .dictionary_policy(cudf::io::dictionary_policy::NEVER);
+  cudf::io::write_parquet(out_opts);
+
+  cudf::io::parquet_reader_options in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto result = cudf::io::read_parquet(in_opts);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view());
+}
+
+TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaListSliced)
+{
+  using T = TypeParam;
+
+  constexpr int num_slice = 4'000;
+  constexpr int num_rows  = 32 * 1024;
+
+  std::mt19937 gen(6542);
+  std::bernoulli_distribution bn(0.7f);
+  auto valids =
+    cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); });
+  auto values = thrust::make_counting_iterator(0);
+
+  // list<T>
+  constexpr int vals_per_row = 4;
+  auto c1_offset_iter        = cudf::detail::make_counting_transform_iterator(
+    0, [vals_per_row](cudf::size_type idx) { return idx * vals_per_row; });
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> c1_offsets(c1_offset_iter,
+                                                                     c1_offset_iter + num_rows + 1);
+  cudf::test::fixed_width_column_wrapper<T> c1_vals(
+    values, values + (num_rows * vals_per_row), valids);
+  auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows);
+
+  auto _c1 = cudf::make_lists_column(
+    num_rows, c1_offsets.release(), c1_vals.release(), null_count, std::move(null_mask));
+  auto c1 = cudf::purge_nonempty_nulls(*_c1);
+
+  auto const expected = table_view{{*c1}};
+  auto expected_slice = cudf::slice(expected, {num_slice, 2 * num_slice});
+  ASSERT_EQ(expected_slice[0].num_rows(), num_slice);
+
+  auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedListSliced.parquet");
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice)
+      .write_v2_headers(true)
+      .dictionary_policy(cudf::io::dictionary_policy::NEVER);
+  cudf::io::write_parquet(out_opts);
+
+  cudf::io::parquet_reader_options in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto result = cudf::io::read_parquet(in_opts);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view());
+}
+
 TEST_F(ParquetWriterTest, EmptyMinStringStatistics)
 {
   char const* const min_val = "";

From 0341bb7cebfab1fb45d4a53cfc495265bb96ee3a Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 20 Oct 2023 10:46:02 -0700
Subject: [PATCH 3/7] Expose streams in public null mask APIs (#14263)

Contributes to #925

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/14263
---
 cpp/include/cudf/lists/detail/scatter.cuh |  6 +-
 cpp/include/cudf/null_mask.hpp            | 24 +++++-
 cpp/src/binaryop/binaryop.cpp             |  2 +-
 cpp/src/bitmask/null_mask.cu              | 38 +++++++---
 cpp/src/copying/concatenate.cu            |  2 +-
 cpp/src/copying/scatter.cu                |  5 +-
 cpp/src/groupby/hash/groupby.cu           |  3 +-
 cpp/src/lists/contains.cu                 | 16 ++--
 cpp/src/merge/merge.cu                    |  2 +-
 cpp/src/round/round.cu                    | 16 +++-
 cpp/src/search/contains_column.cu         |  2 +-
 cpp/src/strings/replace/multi.cu          |  2 +-
 cpp/src/strings/split/split_re.cu         |  2 +-
 cpp/src/strings/split/split_record.cu     |  6 +-
 cpp/src/unary/cast_ops.cu                 |  8 +-
 cpp/src/unary/math_ops.cu                 |  8 +-
 cpp/tests/CMakeLists.txt                  |  1 +
 cpp/tests/streams/null_mask_test.cpp      | 92 +++++++++++++++++++++++
 18 files changed, 191 insertions(+), 44 deletions(-)
 create mode 100644 cpp/tests/streams/null_mask_test.cpp

diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh
index f04b2fda2bf..ff148c59a23 100644
--- a/cpp/include/cudf/lists/detail/scatter.cuh
+++ b/cpp/include/cudf/lists/detail/scatter.cuh
@@ -20,9 +20,9 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
 #include <cudf/lists/detail/scatter_helper.cuh>
 #include <cudf/lists/list_device_view.cuh>
-#include <cudf/null_mask.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -130,8 +130,8 @@ std::unique_ptr<column> scatter_impl(rmm::device_uvector<unbound_list_view> cons
   std::vector<std::unique_ptr<column>> children;
   children.emplace_back(std::move(offsets_column));
   children.emplace_back(std::move(child_column));
-  auto null_mask =
-    target.has_nulls() ? copy_bitmask(target, stream, mr) : rmm::device_buffer{0, stream, mr};
+  auto null_mask = target.has_nulls() ? cudf::detail::copy_bitmask(target, stream, mr)
+                                      : rmm::device_buffer{0, stream, mr};
 
   // The output column from this function only has null masks copied from the target columns.
   // That is still not a correct final null mask for the scatter result.
diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp
index 672f479ad53..524296e60ca 100644
--- a/cpp/include/cudf/null_mask.hpp
+++ b/cpp/include/cudf/null_mask.hpp
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/device_buffer.hpp>
@@ -80,6 +81,7 @@ size_type num_bitmask_words(size_type number_of_bits);
  *
  * @param size The number of elements to be represented by the mask
  * @param state The desired state of the mask
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned device_buffer
  * @return A `device_buffer` for use as a null bitmask
  * satisfying the desired size and state
@@ -87,6 +89,7 @@ size_type num_bitmask_words(size_type number_of_bits);
 rmm::device_buffer create_null_mask(
   size_type size,
   mask_state state,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -100,8 +103,13 @@ rmm::device_buffer create_null_mask(
  * @param begin_bit Index of the first bit to set (inclusive)
  * @param end_bit Index of the last bit to set (exclusive)
  * @param valid If true set all entries to valid; otherwise, set all to null
+ * @param stream CUDA stream used for device memory operations and kernel launches
  */
-void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid);
+void set_null_mask(bitmask_type* bitmask,
+                   size_type begin_bit,
+                   size_type end_bit,
+                   bool valid,
+                   rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /**
  * @brief Creates a `device_buffer` from a slice of bitmask defined by a range
@@ -115,6 +123,7 @@ void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit
  * @param mask Bitmask residing in device memory whose bits will be copied
  * @param begin_bit Index of the first bit to be copied (inclusive)
  * @param end_bit Index of the last bit to be copied (exclusive)
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned device_buffer
  * @return A `device_buffer` containing the bits
  * `[begin_bit, end_bit)` from `mask`.
@@ -123,6 +132,7 @@ rmm::device_buffer copy_bitmask(
   bitmask_type const* mask,
   size_type begin_bit,
   size_type end_bit,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -132,12 +142,14 @@ rmm::device_buffer copy_bitmask(
  * Returns empty `device_buffer` if the column is not nullable
  *
  * @param view Column view whose bitmask needs to be copied
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned device_buffer
  * @return A `device_buffer` containing the bits
  * `[view.offset(), view.offset() + view.size())` from `view`'s bitmask.
  */
 rmm::device_buffer copy_bitmask(
   column_view const& view,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -148,11 +160,13 @@ rmm::device_buffer copy_bitmask(
  * If no column in the table is nullable, an empty bitmask is returned.
  *
  * @param view The table of columns
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned device_buffer
  * @return A pair of resulting bitmask and count of unset bits
  */
 std::pair<rmm::device_buffer, size_type> bitmask_and(
   table_view const& view,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -163,11 +177,13 @@ std::pair<rmm::device_buffer, size_type> bitmask_and(
  * If no column in the table is nullable, an empty bitmask is returned.
  *
  * @param view The table of columns
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned device_buffer
  * @return A pair of resulting bitmask and count of unset bits
  */
 std::pair<rmm::device_buffer, size_type> bitmask_or(
   table_view const& view,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -183,8 +199,12 @@ std::pair<rmm::device_buffer, size_type> bitmask_or(
  * @param bitmask Validity bitmask residing in device memory.
  * @param start Index of the first bit to count (inclusive).
  * @param stop Index of the last bit to count (exclusive).
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @return The number of null elements in the specified range.
  */
-cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop);
+cudf::size_type null_count(bitmask_type const* bitmask,
+                           size_type start,
+                           size_type stop,
+                           rmm::cuda_stream_view stream = cudf::get_default_stream());
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index 6b413ab2be4..53b04c4ca80 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -366,7 +366,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
 
   CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match");
 
-  auto [new_mask, null_count] = bitmask_and(table_view({lhs, rhs}), stream, mr);
+  auto [new_mask, null_count] = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr);
   auto out =
     make_fixed_width_column(output_type, lhs.size(), std::move(new_mask), null_count, stream, mr);
 
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index 5a0d3e4f120..3ff56eabe1e 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -157,16 +157,21 @@ void set_null_mask(bitmask_type* bitmask,
 // Create a device_buffer for a null mask
 rmm::device_buffer create_null_mask(size_type size,
                                     mask_state state,
+                                    rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
-  return detail::create_null_mask(size, state, cudf::get_default_stream(), mr);
+  return detail::create_null_mask(size, state, stream, mr);
 }
 
 // Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true,
 // or null, otherwise;
-void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid)
+void set_null_mask(bitmask_type* bitmask,
+                   size_type begin_bit,
+                   size_type end_bit,
+                   bool valid,
+                   rmm::cuda_stream_view stream)
 {
-  return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, cudf::get_default_stream());
+  return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, stream);
 }
 
 namespace detail {
@@ -511,33 +516,46 @@ std::pair<rmm::device_buffer, size_type> bitmask_or(table_view const& view,
 rmm::device_buffer copy_bitmask(bitmask_type const* mask,
                                 size_type begin_bit,
                                 size_type end_bit,
+                                rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
 {
-  return detail::copy_bitmask(mask, begin_bit, end_bit, cudf::get_default_stream(), mr);
+  CUDF_FUNC_RANGE();
+  return detail::copy_bitmask(mask, begin_bit, end_bit, stream, mr);
 }
 
 // Create a bitmask from a column view
-rmm::device_buffer copy_bitmask(column_view const& view, rmm::mr::device_memory_resource* mr)
+rmm::device_buffer copy_bitmask(column_view const& view,
+                                rmm::cuda_stream_view stream,
+                                rmm::mr::device_memory_resource* mr)
 {
-  return detail::copy_bitmask(view, cudf::get_default_stream(), mr);
+  CUDF_FUNC_RANGE();
+  return detail::copy_bitmask(view, stream, mr);
 }
 
 std::pair<rmm::device_buffer, size_type> bitmask_and(table_view const& view,
+                                                     rmm::cuda_stream_view stream,
                                                      rmm::mr::device_memory_resource* mr)
 {
-  return detail::bitmask_and(view, cudf::get_default_stream(), mr);
+  CUDF_FUNC_RANGE();
+  return detail::bitmask_and(view, stream, mr);
 }
 
 std::pair<rmm::device_buffer, size_type> bitmask_or(table_view const& view,
+                                                    rmm::cuda_stream_view stream,
                                                     rmm::mr::device_memory_resource* mr)
 {
-  return detail::bitmask_or(view, cudf::get_default_stream(), mr);
+  CUDF_FUNC_RANGE();
+  return detail::bitmask_or(view, stream, mr);
 }
 
 // Count non-zero bits in the specified range
-cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop)
+cudf::size_type null_count(bitmask_type const* bitmask,
+                           size_type start,
+                           size_type stop,
+                           rmm::cuda_stream_view stream)
 {
-  return detail::null_count(bitmask, start, stop, cudf::get_default_stream());
+  CUDF_FUNC_RANGE();
+  return detail::null_count(bitmask, start, stop, stream);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index d08c3025553..9b9e780965a 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -563,7 +563,7 @@ rmm::device_buffer concatenate_masks(host_span<column_view const> views,
       });
 
     rmm::device_buffer null_mask =
-      create_null_mask(total_element_count, mask_state::UNINITIALIZED, mr);
+      cudf::detail::create_null_mask(total_element_count, mask_state::UNINITIALIZED, stream, mr);
 
     detail::concatenate_masks(views, static_cast<bitmask_type*>(null_mask.data()), stream);
 
diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 11c27fc86e3..879ddb5048e 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -268,8 +268,9 @@ struct column_scalar_scatterer_impl<struct_view, MapIterator> {
 
     // Compute null mask
     rmm::device_buffer null_mask =
-      target.nullable() ? copy_bitmask(target, stream, mr)
-                        : create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr);
+      target.nullable()
+        ? detail::copy_bitmask(target, stream, mr)
+        : detail::create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr);
     column null_mask_stub(data_type{type_id::STRUCT},
                           target.size(),
                           rmm::device_buffer{},
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 506832881a9..195c8924c9a 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -410,7 +410,8 @@ void sparse_to_dense_results(table_view const& keys,
                              rmm::cuda_stream_view stream,
                              rmm::mr::device_memory_resource* mr)
 {
-  auto row_bitmask = bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first;
+  auto row_bitmask =
+    cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first;
   bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE;
   bitmask_type const* row_bitmask_ptr =
     skip_key_rows_with_nulls ? static_cast<bitmask_type*>(row_bitmask.data()) : nullptr;
diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu
index 4733a5d63a8..cd2bc493bc7 100644
--- a/cpp/src/lists/contains.cu
+++ b/cpp/src/lists/contains.cu
@@ -16,6 +16,7 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/lists/detail/contains.hpp>
 #include <cudf/lists/detail/lists_column_factories.hpp>
@@ -274,12 +275,13 @@ std::unique_ptr<column> index_of(lists_column_view const& lists,
                                  rmm::mr::device_memory_resource* mr)
 {
   if (!search_key.is_valid(stream)) {
-    return make_numeric_column(data_type{cudf::type_to_id<size_type>()},
-                               lists.size(),
-                               cudf::create_null_mask(lists.size(), mask_state::ALL_NULL, mr),
-                               lists.size(),
-                               stream,
-                               mr);
+    return make_numeric_column(
+      data_type{cudf::type_to_id<size_type>()},
+      lists.size(),
+      cudf::detail::create_null_mask(lists.size(), mask_state::ALL_NULL, stream, mr),
+      lists.size(),
+      stream,
+      mr);
   }
   if (lists.size() == 0) {
     return make_numeric_column(
@@ -337,7 +339,7 @@ std::unique_ptr<column> contains_nulls(lists_column_view const& lists,
   auto const lists_cv      = lists.parent();
   auto output              = make_numeric_column(data_type{type_to_id<bool>()},
                                     lists.size(),
-                                    copy_bitmask(lists_cv, stream, mr),
+                                    cudf::detail::copy_bitmask(lists_cv, stream, mr),
                                     lists_cv.null_count(),
                                     stream,
                                     mr);
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index c0765b48205..00a2f0bee8f 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -381,7 +381,7 @@ std::unique_ptr<column> column_merger::operator()<cudf::struct_view>(
   // materialize the output buffer
   rmm::device_buffer validity =
     lcol.has_nulls() || rcol.has_nulls()
-      ? create_null_mask(merged_size, mask_state::UNINITIALIZED, stream, mr)
+      ? detail::create_null_mask(merged_size, mask_state::UNINITIALIZED, stream, mr)
       : rmm::device_buffer{};
   if (lcol.has_nulls() || rcol.has_nulls()) {
     materialize_bitmask(lcol,
diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index 41cce57d55b..8a6367a1f87 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -219,8 +219,12 @@ std::unique_ptr<column> round_with(column_view const& input,
   if (decimal_places >= 0 && std::is_integral_v<T>)
     return std::make_unique<cudf::column>(input, stream, mr);
 
-  auto result = cudf::make_fixed_width_column(
-    input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr);
+  auto result = cudf::make_fixed_width_column(input.type(),
+                                              input.size(),
+                                              detail::copy_bitmask(input, stream, mr),
+                                              input.null_count(),
+                                              stream,
+                                              mr);
 
   auto out_view = result->mutable_view();
   T const n     = std::pow(10, std::abs(decimal_places));
@@ -256,8 +260,12 @@ std::unique_ptr<column> round_with(column_view const& input,
   if (input.type().scale() > -decimal_places)
     return cudf::detail::cast(input, result_type, stream, mr);
 
-  auto result = cudf::make_fixed_width_column(
-    result_type, input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr);
+  auto result = cudf::make_fixed_width_column(result_type,
+                                              input.size(),
+                                              detail::copy_bitmask(input, stream, mr),
+                                              input.null_count(),
+                                              stream,
+                                              mr);
 
   auto out_view = result->mutable_view();
 
diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu
index 85971647434..b8c7d058535 100644
--- a/cpp/src/search/contains_column.cu
+++ b/cpp/src/search/contains_column.cu
@@ -42,7 +42,7 @@ struct contains_column_dispatch {
                                      stream,
                                      mr);
     return std::make_unique<column>(
-      std::move(result_v), copy_bitmask(needles, stream, mr), needles.null_count());
+      std::move(result_v), detail::copy_bitmask(needles, stream, mr), needles.null_count());
   }
 };
 
diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index ee47932100a..f80ace57c69 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -383,7 +383,7 @@ std::unique_ptr<column> replace_character_parallel(strings_column_view const& in
                              std::move(offsets),
                              std::move(chars->release().children.back()),
                              input.null_count(),
-                             copy_bitmask(input.parent(), stream, mr));
+                             cudf::detail::copy_bitmask(input.parent(), stream, mr));
 }
 
 /**
diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index 913aec79758..045aac279e6 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -290,7 +290,7 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
                            std::move(offsets),
                            std::move(strings_output),
                            input.null_count(),
-                           copy_bitmask(input.parent(), stream, mr),
+                           cudf::detail::copy_bitmask(input.parent(), stream, mr),
                            stream,
                            mr);
 }
diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu
index 52f27c68111..7a0cfb9ef41 100644
--- a/cpp/src/strings/split/split_record.cu
+++ b/cpp/src/strings/split/split_record.cu
@@ -57,7 +57,7 @@ std::unique_ptr<column> split_record_fn(strings_column_view const& input,
                              std::move(offsets),
                              std::move(results),
                              input.null_count(),
-                             copy_bitmask(input.parent(), stream, mr),
+                             cudf::detail::copy_bitmask(input.parent(), stream, mr),
                              stream,
                              mr);
   }
@@ -72,7 +72,7 @@ std::unique_ptr<column> split_record_fn(strings_column_view const& input,
                            std::move(offsets),
                            std::move(strings_child),
                            input.null_count(),
-                           copy_bitmask(input.parent(), stream, mr),
+                           cudf::detail::copy_bitmask(input.parent(), stream, mr),
                            stream,
                            mr);
 }
@@ -160,7 +160,7 @@ std::unique_ptr<column> whitespace_split_record_fn(strings_column_view const& in
                            std::move(offsets),
                            std::move(strings_output),
                            input.null_count(),
-                           copy_bitmask(input.parent(), stream, mr),
+                           cudf::detail::copy_bitmask(input.parent(), stream, mr),
                            stream,
                            mr);
 }
diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu
index 1c81f266200..6fa87b1f709 100644
--- a/cpp/src/unary/cast_ops.cu
+++ b/cpp/src/unary/cast_ops.cu
@@ -194,7 +194,7 @@ std::unique_ptr<column> rescale(column_view input,
       auto const scalar  = make_fixed_point_scalar<T>(0, scale_type{scale}, stream);
       auto output_column = make_column_from_scalar(*scalar, input.size(), stream, mr);
       if (input.nullable()) {
-        auto const null_mask = copy_bitmask(input, stream, mr);
+        auto const null_mask = detail::copy_bitmask(input, stream, mr);
         output_column->set_null_mask(std::move(null_mask), input.null_count());
       }
       return output_column;
@@ -255,7 +255,7 @@ struct dispatch_unary_cast_to {
       std::make_unique<column>(type,
                                size,
                                rmm::device_buffer{size * cudf::size_of(type), stream, mr},
-                               copy_bitmask(input, stream, mr),
+                               detail::copy_bitmask(input, stream, mr),
                                input.null_count());
 
     mutable_column_view output_mutable = *output;
@@ -285,7 +285,7 @@ struct dispatch_unary_cast_to {
       std::make_unique<column>(type,
                                size,
                                rmm::device_buffer{size * cudf::size_of(type), stream, mr},
-                               copy_bitmask(input, stream, mr),
+                               detail::copy_bitmask(input, stream, mr),
                                input.null_count());
 
     mutable_column_view output_mutable = *output;
@@ -334,7 +334,7 @@ struct dispatch_unary_cast_to {
       auto output     = std::make_unique<column>(cudf::data_type{type.id(), input.type().scale()},
                                              size,
                                              rmm::device_buffer{size * cudf::size_of(type), stream},
-                                             copy_bitmask(input, stream, mr),
+                                             detail::copy_bitmask(input, stream, mr),
                                              input.null_count());
 
       mutable_column_view output_mutable = *output;
diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu
index d0cae81a9c8..d84e0171b49 100644
--- a/cpp/src/unary/math_ops.cu
+++ b/cpp/src/unary/math_ops.cu
@@ -291,8 +291,12 @@ std::unique_ptr<column> unary_op_with(column_view const& input,
        std::is_same_v<FixedPointUnaryOpFunctor, fixed_point_floor<Type>>))
     return std::make_unique<cudf::column>(input, stream, mr);
 
-  auto result = cudf::make_fixed_width_column(
-    input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr);
+  auto result = cudf::make_fixed_width_column(input.type(),
+                                              input.size(),
+                                              detail::copy_bitmask(input, stream, mr),
+                                              input.null_count(),
+                                              stream,
+                                              mr);
 
   auto out_view = result->mutable_view();
 
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 3e30db7abcb..16e7239ebd8 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -629,6 +629,7 @@ ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/streams/null_mask_test.cpp b/cpp/tests/streams/null_mask_test.cpp
new file mode 100644
index 00000000000..7e59201c8cf
--- /dev/null
+++ b/cpp/tests/streams/null_mask_test.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <tests/binaryop/util/runtime_support.h>
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/null_mask.hpp>
+#include <cudf/scalar/scalar.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+class NullMaskTest : public cudf::test::BaseFixture {};
+
+TEST_F(NullMaskTest, CreateNullMask)
+{
+  cudf::create_null_mask(10, cudf::mask_state::ALL_VALID, cudf::test::get_default_stream());
+}
+
+TEST_F(NullMaskTest, SetNullMask)
+{
+  cudf::test::fixed_width_column_wrapper<bool> col({0, 1, 0, 1, 1},
+                                                   {true, false, true, false, false});
+
+  cudf::set_null_mask(static_cast<cudf::mutable_column_view>(col).null_mask(),
+                      0,
+                      3,
+                      false,
+                      cudf::test::get_default_stream());
+}
+
+TEST_F(NullMaskTest, CopyBitmask)
+{
+  cudf::test::fixed_width_column_wrapper<bool> const col({0, 1, 0, 1, 1},
+                                                         {true, false, true, false, false});
+
+  cudf::copy_bitmask(
+    static_cast<cudf::column_view>(col).null_mask(), 0, 3, cudf::test::get_default_stream());
+}
+
+TEST_F(NullMaskTest, CopyBitmaskFromColumn)
+{
+  cudf::test::fixed_width_column_wrapper<bool> const col({0, 1, 0, 1, 1},
+                                                         {true, false, true, false, false});
+
+  cudf::copy_bitmask(col, cudf::test::get_default_stream());
+}
+
+TEST_F(NullMaskTest, BitMaskAnd)
+{
+  cudf::test::fixed_width_column_wrapper<bool> const col1({0, 1, 0, 1, 1},
+                                                          {true, false, true, false, false});
+  cudf::test::fixed_width_column_wrapper<bool> const col2({0, 1, 0, 1, 1},
+                                                          {true, true, false, false, true});
+
+  auto tbl = cudf::table_view{{col1, col2}};
+  cudf::bitmask_and(tbl, cudf::test::get_default_stream());
+}
+
+TEST_F(NullMaskTest, BitMaskOr)
+{
+  cudf::test::fixed_width_column_wrapper<bool> const col1({0, 1, 0, 1, 1},
+                                                          {true, false, true, false, false});
+  cudf::test::fixed_width_column_wrapper<bool> const col2({0, 1, 0, 1, 1},
+                                                          {true, true, false, false, true});
+
+  auto tbl = cudf::table_view{{col1, col2}};
+  cudf::bitmask_or(tbl, cudf::test::get_default_stream());
+}
+
+TEST_F(NullMaskTest, NullCount)
+{
+  cudf::test::fixed_width_column_wrapper<bool> const col({0, 1, 0, 1, 1},
+                                                         {true, true, false, false, true});
+
+  cudf::null_count(
+    static_cast<cudf::column_view>(col).null_mask(), 0, 4, cudf::test::get_default_stream());
+}

From e7c6365a4976881dc3cf0bcbfa254eb664cfe877 Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Fri, 20 Oct 2023 11:32:38 -0700
Subject: [PATCH 4/7] Detect and report errors in Parquet header parsing
 (#14237)

Fixes #13656.  Uses the error reporting introduced in #14167 to report errors in header parsing.

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14237
---
 cpp/src/io/parquet/error.hpp                  |  77 ++++++++++++++++++
 cpp/src/io/parquet/page_data.cu               |   5 +-
 cpp/src/io/parquet/page_delta_decode.cu       |   5 +-
 cpp/src/io/parquet/page_hdr.cu                |  58 ++++++-------
 cpp/src/io/parquet/page_string_decode.cu      |   5 +-
 cpp/src/io/parquet/parquet_gpu.hpp            |  32 +++++++-
 cpp/src/io/parquet/reader_impl.cpp            |  11 ++-
 cpp/src/io/parquet/reader_impl_preprocess.cu  |  44 ++++------
 .../cudf/tests/data/parquet/bad_dict.parquet  | Bin 0 -> 2850 bytes
 python/cudf/cudf/tests/test_parquet.py        |   8 ++
 10 files changed, 170 insertions(+), 75 deletions(-)
 create mode 100644 cpp/src/io/parquet/error.hpp
 create mode 100644 python/cudf/cudf/tests/data/parquet/bad_dict.parquet

diff --git a/cpp/src/io/parquet/error.hpp b/cpp/src/io/parquet/error.hpp
new file mode 100644
index 00000000000..92b5eebe9fd
--- /dev/null
+++ b/cpp/src/io/parquet/error.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_scalar.hpp>
+
+#include <cstdint>
+#include <sstream>
+
+namespace cudf::io::parquet {
+
+/**
+ * @brief Wrapper around a `rmm::device_scalar` for use in reporting errors that occur in
+ * kernel calls.
+ *
+ * The `kernel_error` object is created with a `rmm::cuda_stream_view` which is used throughout
+ * the object's lifetime.
+ */
+class kernel_error {
+ private:
+  rmm::device_scalar<int32_t> _error_code;
+
+ public:
+  /**
+   * @brief Construct a new `kernel_error` with an initial value of 0.
+   *
+   * Note: the initial value is set asynchronously.
+   *
+   * @throws `rmm::bad_alloc` if allocating the device memory for `initial_value` fails.
+   * @throws `rmm::cuda_error` if copying `initial_value` to device memory fails.
+   *
+   * @param CUDA stream to use
+   */
+  kernel_error(rmm::cuda_stream_view stream) : _error_code{0, stream} {}
+
+  /**
+   * @brief Return a pointer to the device memory for the error
+   */
+  [[nodiscard]] auto data() { return _error_code.data(); }
+
+  /**
+   * @brief Return the current value of the error
+   *
+   * This uses the stream used to create this instance. This does a synchronize on the stream
+   * this object was instantiated with.
+   */
+  [[nodiscard]] auto value() const { return _error_code.value(_error_code.stream()); }
+
+  /**
+   * @brief Return a hexadecimal string representation of the current error code
+   *
+   * Returned string will have "0x" prepended.
+   */
+  [[nodiscard]] std::string str() const
+  {
+    std::stringstream sstream;
+    sstream << std::hex << value();
+    return "0x" + sstream.str();
+  }
+};
+
+}  // namespace cudf::io::parquet
diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index cce3659b902..a783b489c02 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -599,10 +599,7 @@ __global__ void __launch_bounds__(decode_block_size)
     }
     __syncthreads();
   }
-  if (t == 0 and s->error != 0) {
-    cuda::atomic_ref<int32_t, cuda::thread_scope_device> ref{*error_code};
-    ref.fetch_or(s->error, cuda::std::memory_order_relaxed);
-  }
+  if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
 }
 
 struct mask_tform {
diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu
index d25684a59f3..bb5e5066b69 100644
--- a/cpp/src/io/parquet/page_delta_decode.cu
+++ b/cpp/src/io/parquet/page_delta_decode.cu
@@ -151,10 +151,7 @@ __global__ void __launch_bounds__(96)
     __syncthreads();
   }
 
-  if (t == 0 and s->error != 0) {
-    cuda::atomic_ref<int32_t, cuda::thread_scope_device> ref{*error_code};
-    ref.fetch_or(s->error, cuda::std::memory_order_relaxed);
-  }
+  if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
 }
 
 }  // anonymous namespace
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index eae8e05e61e..22add2fffc6 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -16,6 +16,9 @@
 
 #include "parquet_gpu.hpp"
 #include <io/utilities/block_utils.cuh>
+
+#include <cudf/detail/utilities/cuda.cuh>
+
 #include <thrust/tuple.h>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -25,23 +28,6 @@ namespace cudf::io::parquet::detail {
 // Minimal thrift implementation for parsing page headers
 // https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md
 
-static const __device__ __constant__ uint8_t g_list2struct[16] = {0,
-                                                                  1,
-                                                                  2,
-                                                                  ST_FLD_BYTE,
-                                                                  ST_FLD_DOUBLE,
-                                                                  5,
-                                                                  ST_FLD_I16,
-                                                                  7,
-                                                                  ST_FLD_I32,
-                                                                  9,
-                                                                  ST_FLD_I64,
-                                                                  ST_FLD_BINARY,
-                                                                  ST_FLD_STRUCT,
-                                                                  ST_FLD_MAP,
-                                                                  ST_FLD_SET,
-                                                                  ST_FLD_LIST};
-
 struct byte_stream_s {
   uint8_t const* cur{};
   uint8_t const* end{};
@@ -140,12 +126,13 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type)
       case ST_FLD_SET: {  // NOTE: skipping a list of lists is not handled
         auto const c = getb(bs);
         int n        = c >> 4;
-        if (n == 0xf) n = get_u32(bs);
-        field_type = g_list2struct[c & 0xf];
-        if (field_type == ST_FLD_STRUCT)
+        if (n == 0xf) { n = get_u32(bs); }
+        field_type = c & 0xf;
+        if (field_type == ST_FLD_STRUCT) {
           struct_depth += n;
-        else
+        } else {
           rep_cnt = n;
+        }
       } break;
       case ST_FLD_STRUCT: struct_depth++; break;
     }
@@ -356,16 +343,20 @@ struct gpuParsePageHeader {
  */
 // blockDim {128,1,1}
 __global__ void __launch_bounds__(128)
-  gpuDecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks)
+  gpuDecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, int32_t* error_code)
 {
+  using cudf::detail::warp_size;
   gpuParsePageHeader parse_page_header;
   __shared__ byte_stream_s bs_g[4];
 
-  int lane_id             = threadIdx.x % 32;
-  int chunk               = (blockIdx.x * 4) + (threadIdx.x / 32);
-  byte_stream_s* const bs = &bs_g[threadIdx.x / 32];
+  int32_t error[4]   = {0};
+  auto const lane_id = threadIdx.x % warp_size;
+  auto const warp_id = threadIdx.x / warp_size;
+  auto const chunk   = (blockIdx.x * 4) + warp_id;
+  auto const bs      = &bs_g[warp_id];
 
-  if (chunk < num_chunks and lane_id == 0) bs->ck = chunks[chunk];
+  if (chunk < num_chunks and lane_id == 0) { bs->ck = chunks[chunk]; }
+  if (lane_id == 0) { error[warp_id] = 0; }
   __syncthreads();
 
   if (chunk < num_chunks) {
@@ -376,7 +367,7 @@ __global__ void __launch_bounds__(128)
     int32_t num_dict_pages = bs->ck.num_dict_pages;
     PageInfo* page_info;
 
-    if (!lane_id) {
+    if (lane_id == 0) {
       bs->base = bs->cur      = bs->ck.compressed_data;
       bs->end                 = bs->base + bs->ck.compressed_size;
       bs->page.chunk_idx      = chunk;
@@ -412,6 +403,9 @@ __global__ void __launch_bounds__(128)
         bs->page.lvl_bytes[level_type::DEFINITION] = 0;
         bs->page.lvl_bytes[level_type::REPETITION] = 0;
         if (parse_page_header(bs) && bs->page.compressed_page_size >= 0) {
+          if (not is_supported_encoding(bs->page.encoding)) {
+            error[warp_id] |= static_cast<int32_t>(decode_error::UNSUPPORTED_ENCODING);
+          }
           switch (bs->page_type) {
             case PageType::DATA_PAGE:
               index_out = num_dict_pages + data_page_count;
@@ -440,20 +434,25 @@ __global__ void __launch_bounds__(128)
           }
           bs->page.page_data = const_cast<uint8_t*>(bs->cur);
           bs->cur += bs->page.compressed_page_size;
+          if (bs->cur > bs->end) {
+            error[warp_id] |= static_cast<int32_t>(decode_error::DATA_STREAM_OVERRUN);
+          }
           bs->page.kernel_mask = kernel_mask_for_page(bs->page, bs->ck);
         } else {
           bs->cur = bs->end;
         }
       }
       index_out = shuffle(index_out);
-      if (index_out >= 0 && index_out < max_num_pages && lane_id == 0)
+      if (index_out >= 0 && index_out < max_num_pages && lane_id == 0) {
         page_info[index_out] = bs->page;
+      }
       num_values = shuffle(num_values);
       __syncwarp();
     }
     if (lane_id == 0) {
       chunks[chunk].num_data_pages = data_page_count;
       chunks[chunk].num_dict_pages = dictionary_page_count;
+      if (error[warp_id] != 0) { set_error(error[warp_id], error_code); }
     }
   }
 }
@@ -509,11 +508,12 @@ __global__ void __launch_bounds__(128)
 
 void __host__ DecodePageHeaders(ColumnChunkDesc* chunks,
                                 int32_t num_chunks,
+                                int32_t* error_code,
                                 rmm::cuda_stream_view stream)
 {
   dim3 dim_block(128, 1);
   dim3 dim_grid((num_chunks + 3) >> 2, 1);  // 1 chunk per warp, 4 warps per block
-  gpuDecodePageHeaders<<<dim_grid, dim_block, 0, stream.value()>>>(chunks, num_chunks);
+  gpuDecodePageHeaders<<<dim_grid, dim_block, 0, stream.value()>>>(chunks, num_chunks, error_code);
 }
 
 void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks,
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index 4d79770ec34..4c7d8e3c20a 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -745,10 +745,7 @@ __global__ void __launch_bounds__(decode_block_size)
   auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
   block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
 
-  if (t == 0 and s->error != 0) {
-    cuda::atomic_ref<int32_t, cuda::thread_scope_device> ref{*error_code};
-    ref.fetch_or(s->error, cuda::std::memory_order_relaxed);
-  }
+  if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
 }
 
 }  // anonymous namespace
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 048f1a73a9c..164e2cea2ed 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -31,6 +31,8 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <cuda/atomic>
+
 #include <cuda_runtime.h>
 
 #include <vector>
@@ -54,6 +56,30 @@ constexpr int rolling_index(int index)
   return index % rolling_size;
 }
 
+// see setupLocalPageInfo() in page_decode.cuh for supported page encodings
+constexpr bool is_supported_encoding(Encoding enc)
+{
+  switch (enc) {
+    case Encoding::PLAIN:
+    case Encoding::PLAIN_DICTIONARY:
+    case Encoding::RLE:
+    case Encoding::RLE_DICTIONARY:
+    case Encoding::DELTA_BINARY_PACKED: return true;
+    default: return false;
+  }
+}
+
+/**
+ * @brief Atomically OR `error` into `error_code`.
+ */
+constexpr void set_error(int32_t error, int32_t* error_code)
+{
+  if (error != 0) {
+    cuda::atomic_ref<int32_t, cuda::thread_scope_device> ref{*error_code};
+    ref.fetch_or(error, cuda::std::memory_order_relaxed);
+  }
+}
+
 /**
  * @brief Enum for the different types of errors that can occur during decoding.
  *
@@ -495,9 +521,13 @@ constexpr bool is_string_col(ColumnChunkDesc const& chunk)
  *
  * @param[in] chunks List of column chunks
  * @param[in] num_chunks Number of column chunks
+ * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
-void DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream);
+void DecodePageHeaders(ColumnChunkDesc* chunks,
+                       int32_t num_chunks,
+                       int32_t* error_code,
+                       rmm::cuda_stream_view stream);
 
 /**
  * @brief Launches kernel for building the dictionary index for the column
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index db81222157a..11c20d0e540 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -15,6 +15,7 @@
  */
 
 #include "reader_impl.hpp"
+#include "error.hpp"
 
 #include <cudf/detail/stream_compaction.hpp>
 #include <cudf/detail/transform.hpp>
@@ -163,7 +164,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
   chunk_nested_valids.host_to_device_async(_stream);
   chunk_nested_data.host_to_device_async(_stream);
 
-  rmm::device_scalar<int32_t> error_code(0, _stream);
+  // create this before we fork streams
+  kernel_error error_code(_stream);
 
   // get the number of streams we need from the pool and tell them to wait on the H2D copies
   int const nkernels = std::bitset<32>(kernel_mask).count();
@@ -199,11 +201,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
   page_nesting.device_to_host_async(_stream);
   page_nesting_decode.device_to_host_async(_stream);
 
-  auto const decode_error = error_code.value(_stream);
-  if (decode_error != 0) {
-    std::stringstream stream;
-    stream << std::hex << decode_error;
-    CUDF_FAIL("Parquet data decode failed with code(s) 0x" + stream.str());
+  if (error_code.value() != 0) {
+    CUDF_FAIL("Parquet data decode failed with code(s) " + error_code.str());
   }
 
   // for list columns, add the final offset to every offset buffer.
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index ce45f709ee1..8494dc72a1d 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "error.hpp"
 #include "reader_impl.hpp"
 
 #include <io/comp/nvcomp_adapter.hpp>
@@ -263,10 +264,15 @@ void generate_depth_remappings(std::map<int, std::pair<std::vector<int>, std::ve
 {
   size_t total_pages = 0;
 
+  kernel_error error_code(stream);
   chunks.host_to_device_async(stream);
-  DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream);
+  DecodePageHeaders(chunks.device_ptr(), chunks.size(), error_code.data(), stream);
   chunks.device_to_host_sync(stream);
 
+  if (error_code.value() != 0) {
+    CUDF_FAIL("Parquet header parsing failed with code(s) " + error_code.str());
+  }
+
   for (size_t c = 0; c < chunks.size(); c++) {
     total_pages += chunks[c].num_data_pages + chunks[c].num_dict_pages;
   }
@@ -274,19 +280,6 @@ void generate_depth_remappings(std::map<int, std::pair<std::vector<int>, std::ve
   return total_pages;
 }
 
-// see setupLocalPageInfo() in page_data.cu for supported page encodings
-constexpr bool is_supported_encoding(Encoding enc)
-{
-  switch (enc) {
-    case Encoding::PLAIN:
-    case Encoding::PLAIN_DICTIONARY:
-    case Encoding::RLE:
-    case Encoding::RLE_DICTIONARY:
-    case Encoding::DELTA_BINARY_PACKED: return true;
-    default: return false;
-  }
-}
-
 /**
  * @brief Decode the page information from the given column chunks.
  *
@@ -307,8 +300,14 @@ int decode_page_headers(cudf::detail::hostdevice_vector<ColumnChunkDesc>& chunks
     page_count += chunks[c].max_num_pages;
   }
 
+  kernel_error error_code(stream);
   chunks.host_to_device_async(stream);
-  DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream);
+  DecodePageHeaders(chunks.device_ptr(), chunks.size(), error_code.data(), stream);
+
+  if (error_code.value() != 0) {
+    // TODO(ets): if an unsupported encoding was detected, do extra work to figure out which one
+    CUDF_FAIL("Parquet header parsing failed with code(s)" + error_code.str());
+  }
 
   // compute max bytes needed for level data
   auto level_bit_size =
@@ -318,22 +317,13 @@ int decode_page_headers(cudf::detail::hostdevice_vector<ColumnChunkDesc>& chunks
         max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION]));
     });
   // max level data bit size.
-  int const max_level_bits   = thrust::reduce(rmm::exec_policy(stream),
+  int const max_level_bits = thrust::reduce(rmm::exec_policy(stream),
                                             level_bit_size,
                                             level_bit_size + chunks.size(),
                                             0,
                                             thrust::maximum<int>());
-  auto const level_type_size = std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8));
-
-  pages.device_to_host_sync(stream);
 
-  // validate page encodings
-  CUDF_EXPECTS(std::all_of(pages.begin(),
-                           pages.end(),
-                           [](auto const& page) { return is_supported_encoding(page.encoding); }),
-               "Unsupported page encoding detected");
-
-  return level_type_size;
+  return std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8));
 }
 
 /**
@@ -771,6 +761,7 @@ void reader::impl::load_and_decompress_data()
 
   // decoding of column/page information
   _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream);
+  pages.device_to_host_sync(_stream);
   if (has_compressed_data) {
     decomp_page_data = decompress_page_data(chunks, pages, _stream);
     // Free compressed data
@@ -795,7 +786,6 @@ void reader::impl::load_and_decompress_data()
   // std::vector<output_column_info> output_info = build_output_column_info();
 
   // the following two allocate functions modify the page data
-  pages.device_to_host_sync(_stream);
   {
     // nesting information (sizes, etc) stored -per page-
     // note : even for flat schemas, we allocate 1 level of "nesting" info
diff --git a/python/cudf/cudf/tests/data/parquet/bad_dict.parquet b/python/cudf/cudf/tests/data/parquet/bad_dict.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..5008ac0b22b622cbcf37d5e286c749324fc26535
GIT binary patch
literal 2850
zcmb7G&2Af26n0F~Hfi|Brja6Y7Z7#fSP&MJN~pStO)_mfb{z*P0aRU&ubpYebEosu
z#$t!MKs*8qR@ng0z>-bZJOz)yito(KnX$tHVfi>WbI<?xoqMMz`=8V*wLh2eZPa$L
zyivPVsZ=YK%I*?=D)dADy#`r^tUzu+ZbG0~S%s`YZbQ}~uS4E|ya{;=@;2lh$h(mD
zAT>xG@;>Aa{l8rMmA*#^4*%p<9v!s%mL$)zf6%rEmMtmdUSFPBN3`0PwrojCrG0SP
z8&DqY!J*adS*hZeeQEEv$rT)Rq<u=7mfgyHV$rqR$CjjWEU4yxACuNY*^@Sfc-ZZ?
zq)m?=we3?$&N|(GcYnW2)pojPva@GBK53J2uiZZ(&HY1UN}ByHqFR#raV$?-mXLk9
zXQf+TS?z%>I~1aSa*QmfStqtNXt%^8%RXpzJJiDb$?-rA+Q&fUUhN*ivpf&a%BuK5
z{`mRbU;g;%yN_R4zx_`AM1%hWO+u7}GN%tAOF}4<&~p-AV_AHzvw@jG_OyotUwwdY
zj^?L$B+k`j;_0SvRH(;^=gq{}^_3?^IxvwNM_9z#iAChiu%{-<J%Sk&#^zE-A_&dY
zjWlmNjtNJuKM}DJ@wpbEo+O?Mg~CSS$5B%ZjW@#MXNivBTzSU7fIoP~Y7)64EzCGI
z4YZ$n;zZ_Ci~{Yr$RSY9g_<A@Brnv^Byno$hQ@atWEf(vndinf8sW5xP&m@eG#esv
zJ?+PZKeplY7vhYQ&@~DAA<T0frU7FY#Uj$!OU7dtJ5;5U)is1xs#JD3@slV)`Y<vh
z<GFEao3%Zepo*-ZXNl`vP}7~zL{aLajLoTywh-pI_KV!b3w5DIZre-|*oMj?x(q<J
z4}=<7EAB%RZS_>FJxy?N7zR*W?u2S~U*sh;78E3`k*L0Ok_ezU>Z$QcY7|9qH=8BN
zg3v|#RS1%VaFRo&FN09Sp^9eHAU0{6I3cD*PT<U9o+CL4&W&@SXH?QmhY{4Y4#W^W
zKiA%@)Zazaod5q~Vn+R9EWk_cJ1$|i2P<M)k<8GT2zA6JNa11xGh1%@14bur-iPp#
z5()$&5;V}ktni6D1{d-^s?VS|AB;%PmCUif<QexWP~XB-@I)~_+31UsQ(Q)qRm)5Q
z;A6y4D?iEzR!}`qs5m}DfY3)viiFZZ$JB?!<|&3d8v@TnXna8%=7IoY<}yi*3mQJy
zfp_@kY32DTd6DH*#15arpfI0Hid%q63mCJ|Jh^kjrqi)13oqF+;{zibi?0zaB|+g6
zNEW9AgXx>tgucrh%vi+`@qD4k5uHvcPxApWaG#x`#T;Ba@bX|{ns~7rcrKXUEJ(Ca
ztFI<RmHbG`8mj2j&&MUt!$wFN|NLBJ=9afb4kOoAz?jX~ToW0SCOp!}37oU8gDZlT
zJm1CbKrL_NH&7QA;0fOpxOseH8g7*W*%Ue-M%fLsaP{!)afgY)U%+Kb;cQOJN#V<Z
z!!?T=i#lBb@52G#kX$oe7Uo5MALg@K9KW0eUod>KieadgbAGMu=s0tm2k!s9T~s%|
zIb53J9kEvZsCsA3SFvmSCpZ8rwRJpL_SID9d+5u;-fhQBrvT@PW4uazcVlzoX#3V?
zt-7(YG4H5`z53rv_-^CHZu|S{W^J8RQrGF`%a!`e6>MPl>LxoVqs=+E!rR#P&#SqI
k`2<p>tF8Lg7WV0tw7y%pyB4SrY>0QJVdXCX(Zk#EKmWZ<*#H0l

literal 0
HcmV?d00001

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index b892cc62ac4..d2c08246518 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -2830,6 +2830,14 @@ def test_parquet_reader_unsupported_page_encoding(datadir):
         cudf.read_parquet(fname)
 
 
+def test_parquet_reader_detect_bad_dictionary(datadir):
+    fname = datadir / "bad_dict.parquet"
+
+    # expect a failure when reading the whole file
+    with pytest.raises(RuntimeError):
+        cudf.read_parquet(fname)
+
+
 @pytest.mark.parametrize("data", [{"a": [1, 2, 3, 4]}, {"b": [1, None, 2, 3]}])
 @pytest.mark.parametrize("force_nullable_schema", [True, False])
 def test_parquet_writer_schema_nullability(data, force_nullable_schema):

From 253f6a6d5b19387c05368e073954ff773b3d6a39 Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Fri, 20 Oct 2023 14:05:53 -0700
Subject: [PATCH 5/7] Refactor LogicalType for Parquet (#14264)

Continuation of #14097, this PR refactors the LogicalType struct to use the new way of treating unions defined in the parquet thrift (more enum like than struct like).

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/14264
---
 .../io/parquet/compact_protocol_reader.cpp    |  95 +++--------
 .../io/parquet/compact_protocol_writer.cpp    |  81 +++++----
 cpp/src/io/parquet/page_decode.cuh            |   3 +-
 cpp/src/io/parquet/parquet.hpp                | 156 +++++++++++-------
 cpp/src/io/parquet/parquet_gpu.hpp            |  30 ++--
 cpp/src/io/parquet/reader_impl_chunking.cu    |  13 +-
 cpp/src/io/parquet/reader_impl_helpers.cpp    | 104 ++++++------
 cpp/src/io/parquet/writer_impl.cu             | 107 +++++++-----
 cpp/tests/io/parquet_test.cpp                 |   7 +-
 9 files changed, 293 insertions(+), 303 deletions(-)

diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp
index 1a345ee0750..5a2b8aa8f2a 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -339,61 +339,6 @@ struct parquet_field_struct_list : public parquet_field_list<T> {
   }
 };
 
-// TODO(ets): replace current union handling (which mirrors thrift) to use std::optional fields
-// in a struct
-/**
- * @brief Functor to read a union member from CompactProtocolReader
- *
- * @tparam is_empty True if tparam `T` type is empty type, else false.
- *
- * @return True if field types mismatch or if the process of reading a
- * union member fails
- */
-template <typename T, bool is_empty = false>
-class ParquetFieldUnionFunctor : public parquet_field {
-  bool& is_set;
-  T& val;
-
- public:
-  ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {}
-
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    if (field_type != ST_FLD_STRUCT) {
-      return true;
-    } else {
-      is_set = true;
-      return !cpr->read(&val);
-    }
-  }
-};
-
-template <typename T>
-class ParquetFieldUnionFunctor<T, true> : public parquet_field {
-  bool& is_set;
-  T& val;
-
- public:
-  ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {}
-
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    if (field_type != ST_FLD_STRUCT) {
-      return true;
-    } else {
-      is_set = true;
-      cpr->skip_struct_field(field_type);
-      return false;
-    }
-  }
-};
-
-template <typename T>
-ParquetFieldUnionFunctor<T, std::is_empty_v<T>> ParquetFieldUnion(int f, bool& b, T& v)
-{
-  return ParquetFieldUnionFunctor<T, std::is_empty_v<T>>(f, b, v);
-}
-
 /**
  * @brief Functor to read a binary from CompactProtocolReader
  *
@@ -595,34 +540,38 @@ bool CompactProtocolReader::read(FileMetaData* f)
 
 bool CompactProtocolReader::read(SchemaElement* s)
 {
+  using optional_converted_type =
+    parquet_field_optional<ConvertedType, parquet_field_enum<ConvertedType>>;
+  using optional_logical_type =
+    parquet_field_optional<LogicalType, parquet_field_struct<LogicalType>>;
   auto op = std::make_tuple(parquet_field_enum<Type>(1, s->type),
                             parquet_field_int32(2, s->type_length),
                             parquet_field_enum<FieldRepetitionType>(3, s->repetition_type),
                             parquet_field_string(4, s->name),
                             parquet_field_int32(5, s->num_children),
-                            parquet_field_enum<ConvertedType>(6, s->converted_type),
+                            optional_converted_type(6, s->converted_type),
                             parquet_field_int32(7, s->decimal_scale),
                             parquet_field_int32(8, s->decimal_precision),
                             parquet_field_optional<int32_t, parquet_field_int32>(9, s->field_id),
-                            parquet_field_struct(10, s->logical_type));
+                            optional_logical_type(10, s->logical_type));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(LogicalType* l)
 {
-  auto op =
-    std::make_tuple(ParquetFieldUnion(1, l->isset.STRING, l->STRING),
-                    ParquetFieldUnion(2, l->isset.MAP, l->MAP),
-                    ParquetFieldUnion(3, l->isset.LIST, l->LIST),
-                    ParquetFieldUnion(4, l->isset.ENUM, l->ENUM),
-                    ParquetFieldUnion(5, l->isset.DECIMAL, l->DECIMAL),  // read the struct
-                    ParquetFieldUnion(6, l->isset.DATE, l->DATE),
-                    ParquetFieldUnion(7, l->isset.TIME, l->TIME),            //  read the struct
-                    ParquetFieldUnion(8, l->isset.TIMESTAMP, l->TIMESTAMP),  //  read the struct
-                    ParquetFieldUnion(10, l->isset.INTEGER, l->INTEGER),     //  read the struct
-                    ParquetFieldUnion(11, l->isset.UNKNOWN, l->UNKNOWN),
-                    ParquetFieldUnion(12, l->isset.JSON, l->JSON),
-                    ParquetFieldUnion(13, l->isset.BSON, l->BSON));
+  auto op = std::make_tuple(
+    parquet_field_union_enumerator(1, l->type),
+    parquet_field_union_enumerator(2, l->type),
+    parquet_field_union_enumerator(3, l->type),
+    parquet_field_union_enumerator(4, l->type),
+    parquet_field_union_struct<LogicalType::Type, DecimalType>(5, l->type, l->decimal_type),
+    parquet_field_union_enumerator(6, l->type),
+    parquet_field_union_struct<LogicalType::Type, TimeType>(7, l->type, l->time_type),
+    parquet_field_union_struct<LogicalType::Type, TimestampType>(8, l->type, l->timestamp_type),
+    parquet_field_union_struct<LogicalType::Type, IntType>(10, l->type, l->int_type),
+    parquet_field_union_enumerator(11, l->type),
+    parquet_field_union_enumerator(12, l->type),
+    parquet_field_union_enumerator(13, l->type));
   return function_builder(this, op);
 }
 
@@ -648,9 +597,9 @@ bool CompactProtocolReader::read(TimestampType* t)
 
 bool CompactProtocolReader::read(TimeUnit* u)
 {
-  auto op = std::make_tuple(ParquetFieldUnion(1, u->isset.MILLIS, u->MILLIS),
-                            ParquetFieldUnion(2, u->isset.MICROS, u->MICROS),
-                            ParquetFieldUnion(3, u->isset.NANOS, u->NANOS));
+  auto op = std::make_tuple(parquet_field_union_enumerator(1, u->type),
+                            parquet_field_union_enumerator(2, u->type),
+                            parquet_field_union_enumerator(3, u->type));
   return function_builder(this, op);
 }
 
diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp
index 00810269d3c..fbeda7f1099 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.cpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.cpp
@@ -16,6 +16,8 @@
 
 #include "compact_protocol_writer.hpp"
 
+#include <cudf/utilities/error.hpp>
+
 namespace cudf::io::parquet::detail {
 
 /**
@@ -46,13 +48,11 @@ size_t CompactProtocolWriter::write(DecimalType const& decimal)
 size_t CompactProtocolWriter::write(TimeUnit const& time_unit)
 {
   CompactProtocolFieldWriter c(*this);
-  auto const isset = time_unit.isset;
-  if (isset.MILLIS) {
-    c.field_struct(1, time_unit.MILLIS);
-  } else if (isset.MICROS) {
-    c.field_struct(2, time_unit.MICROS);
-  } else if (isset.NANOS) {
-    c.field_struct(3, time_unit.NANOS);
+  switch (time_unit.type) {
+    case TimeUnit::MILLIS:
+    case TimeUnit::MICROS:
+    case TimeUnit::NANOS: c.field_empty_struct(time_unit.type); break;
+    default: CUDF_FAIL("Trying to write an invalid TimeUnit " + std::to_string(time_unit.type));
   }
   return c.value();
 }
@@ -84,31 +84,29 @@ size_t CompactProtocolWriter::write(IntType const& integer)
 size_t CompactProtocolWriter::write(LogicalType const& logical_type)
 {
   CompactProtocolFieldWriter c(*this);
-  auto const isset = logical_type.isset;
-  if (isset.STRING) {
-    c.field_struct(1, logical_type.STRING);
-  } else if (isset.MAP) {
-    c.field_struct(2, logical_type.MAP);
-  } else if (isset.LIST) {
-    c.field_struct(3, logical_type.LIST);
-  } else if (isset.ENUM) {
-    c.field_struct(4, logical_type.ENUM);
-  } else if (isset.DECIMAL) {
-    c.field_struct(5, logical_type.DECIMAL);
-  } else if (isset.DATE) {
-    c.field_struct(6, logical_type.DATE);
-  } else if (isset.TIME) {
-    c.field_struct(7, logical_type.TIME);
-  } else if (isset.TIMESTAMP) {
-    c.field_struct(8, logical_type.TIMESTAMP);
-  } else if (isset.INTEGER) {
-    c.field_struct(10, logical_type.INTEGER);
-  } else if (isset.UNKNOWN) {
-    c.field_struct(11, logical_type.UNKNOWN);
-  } else if (isset.JSON) {
-    c.field_struct(12, logical_type.JSON);
-  } else if (isset.BSON) {
-    c.field_struct(13, logical_type.BSON);
+  switch (logical_type.type) {
+    case LogicalType::STRING:
+    case LogicalType::MAP:
+    case LogicalType::LIST:
+    case LogicalType::ENUM:
+    case LogicalType::DATE:
+    case LogicalType::UNKNOWN:
+    case LogicalType::JSON:
+    case LogicalType::BSON: c.field_empty_struct(logical_type.type); break;
+    case LogicalType::DECIMAL:
+      c.field_struct(LogicalType::DECIMAL, logical_type.decimal_type.value());
+      break;
+    case LogicalType::TIME:
+      c.field_struct(LogicalType::TIME, logical_type.time_type.value());
+      break;
+    case LogicalType::TIMESTAMP:
+      c.field_struct(LogicalType::TIMESTAMP, logical_type.timestamp_type.value());
+      break;
+    case LogicalType::INTEGER:
+      c.field_struct(LogicalType::INTEGER, logical_type.int_type.value());
+      break;
+    default:
+      CUDF_FAIL("Trying to write an invalid LogicalType " + std::to_string(logical_type.type));
   }
   return c.value();
 }
@@ -124,20 +122,15 @@ size_t CompactProtocolWriter::write(SchemaElement const& s)
   c.field_string(4, s.name);
 
   if (s.type == UNDEFINED_TYPE) { c.field_int(5, s.num_children); }
-  if (s.converted_type != UNKNOWN) {
-    c.field_int(6, s.converted_type);
+  if (s.converted_type.has_value()) {
+    c.field_int(6, s.converted_type.value());
     if (s.converted_type == DECIMAL) {
       c.field_int(7, s.decimal_scale);
       c.field_int(8, s.decimal_precision);
     }
   }
-  if (s.field_id) { c.field_int(9, s.field_id.value()); }
-  auto const isset = s.logical_type.isset;
-  // TODO: add handling for all logical types
-  // if (isset.STRING or isset.MAP or isset.LIST or isset.ENUM or isset.DECIMAL or isset.DATE or
-  //    isset.TIME or isset.TIMESTAMP or isset.INTEGER or isset.UNKNOWN or isset.JSON or isset.BSON)
-  //    {
-  if (isset.TIMESTAMP or isset.TIME) { c.field_struct(10, s.logical_type); }
+  if (s.field_id.has_value()) { c.field_int(9, s.field_id.value()); }
+  if (s.logical_type.has_value()) { c.field_struct(10, s.logical_type.value()); }
   return c.value();
 }
 
@@ -223,9 +216,9 @@ size_t CompactProtocolWriter::write(OffsetIndex const& s)
 size_t CompactProtocolWriter::write(ColumnOrder const& co)
 {
   CompactProtocolFieldWriter c(*this);
-  switch (co) {
-    case ColumnOrder::TYPE_ORDER: c.field_empty_struct(1); break;
-    default: break;
+  switch (co.type) {
+    case ColumnOrder::TYPE_ORDER: c.field_empty_struct(co.type); break;
+    default: CUDF_FAIL("Trying to write an invalid ColumnOrder " + std::to_string(co.type));
   }
   return c.value();
 }
diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index 7c866fd8b9e..ab1cc68923d 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -1143,7 +1143,8 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
               units = cudf::timestamp_ms::period::den;
             } else if (s->col.converted_type == TIMESTAMP_MICROS) {
               units = cudf::timestamp_us::period::den;
-            } else if (s->col.logical_type.TIMESTAMP.unit.isset.NANOS) {
+            } else if (s->col.logical_type.has_value() and
+                       s->col.logical_type->is_timestamp_nanos()) {
               units = cudf::timestamp_ns::period::den;
             }
             if (units and units != s->col.ts_clock_rate) {
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index 1cd16ac6102..699cad89703 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -46,79 +46,98 @@ struct file_ender_s {
   uint32_t magic;
 };
 
-// thrift generated code simplified.
-struct StringType {};
-struct MapType {};
-struct ListType {};
-struct EnumType {};
+// thrift inspired code simplified.
 struct DecimalType {
   int32_t scale     = 0;
   int32_t precision = 0;
 };
-struct DateType {};
-
-struct MilliSeconds {};
-struct MicroSeconds {};
-struct NanoSeconds {};
-using TimeUnit_isset = struct TimeUnit_isset {
-  bool MILLIS{false};
-  bool MICROS{false};
-  bool NANOS{false};
-};
 
 struct TimeUnit {
-  TimeUnit_isset isset;
-  MilliSeconds MILLIS;
-  MicroSeconds MICROS;
-  NanoSeconds NANOS;
+  enum Type { UNDEFINED, MILLIS, MICROS, NANOS };
+  Type type;
 };
 
 struct TimeType {
   bool isAdjustedToUTC = false;
   TimeUnit unit;
 };
+
 struct TimestampType {
   bool isAdjustedToUTC = false;
   TimeUnit unit;
 };
+
 struct IntType {
   int8_t bitWidth = 0;
   bool isSigned   = false;
 };
-struct NullType {};
-struct JsonType {};
-struct BsonType {};
-
-// thrift generated code simplified.
-using LogicalType_isset = struct LogicalType_isset {
-  bool STRING{false};
-  bool MAP{false};
-  bool LIST{false};
-  bool ENUM{false};
-  bool DECIMAL{false};
-  bool DATE{false};
-  bool TIME{false};
-  bool TIMESTAMP{false};
-  bool INTEGER{false};
-  bool UNKNOWN{false};
-  bool JSON{false};
-  bool BSON{false};
-};
 
 struct LogicalType {
-  LogicalType_isset isset;
-  StringType STRING;
-  MapType MAP;
-  ListType LIST;
-  EnumType ENUM;
-  DecimalType DECIMAL;
-  DateType DATE;
-  TimeType TIME;
-  TimestampType TIMESTAMP;
-  IntType INTEGER;
-  NullType UNKNOWN;
-  JsonType JSON;
-  BsonType BSON;
+  enum Type {
+    UNDEFINED,
+    STRING,
+    MAP,
+    LIST,
+    ENUM,
+    DECIMAL,
+    DATE,
+    TIME,
+    TIMESTAMP,
+    // 9 is reserved
+    INTEGER = 10,
+    UNKNOWN,
+    JSON,
+    BSON
+  };
+  Type type;
+  thrust::optional<DecimalType> decimal_type;
+  thrust::optional<TimeType> time_type;
+  thrust::optional<TimestampType> timestamp_type;
+  thrust::optional<IntType> int_type;
+
+  LogicalType(Type tp = UNDEFINED) : type(tp) {}
+  LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {}
+  LogicalType(TimeType&& tt) : type(TIME), time_type(tt) {}
+  LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {}
+  LogicalType(IntType&& it) : type(INTEGER), int_type(it) {}
+
+  constexpr bool is_time_millis() const
+  {
+    return type == TIME and time_type->unit.type == TimeUnit::MILLIS;
+  }
+
+  constexpr bool is_time_micros() const
+  {
+    return type == TIME and time_type->unit.type == TimeUnit::MICROS;
+  }
+
+  constexpr bool is_time_nanos() const
+  {
+    return type == TIME and time_type->unit.type == TimeUnit::NANOS;
+  }
+
+  constexpr bool is_timestamp_millis() const
+  {
+    return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MILLIS;
+  }
+
+  constexpr bool is_timestamp_micros() const
+  {
+    return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MICROS;
+  }
+
+  constexpr bool is_timestamp_nanos() const
+  {
+    return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::NANOS;
+  }
+
+  constexpr int8_t bit_width() const { return type == INTEGER ? int_type->bitWidth : -1; }
+
+  constexpr bool is_signed() const { return type == INTEGER and int_type->isSigned; }
+
+  constexpr int32_t scale() const { return type == DECIMAL ? decimal_type->scale : -1; }
+
+  constexpr int32_t precision() const { return type == DECIMAL ? decimal_type->precision : -1; }
 };
 
 /**
@@ -127,8 +146,6 @@ struct LogicalType {
 struct ColumnOrder {
   enum Type { UNDEFINED, TYPE_ORDER };
   Type type;
-
-  operator Type() const { return type; }
 };
 
 /**
@@ -138,18 +155,29 @@ struct ColumnOrder {
  * as a schema tree.
  */
 struct SchemaElement {
-  Type type                    = UNDEFINED_TYPE;
-  ConvertedType converted_type = UNKNOWN;
-  LogicalType logical_type;
-  int32_t type_length =
-    0;  // Byte length of FIXED_LENGTH_BYTE_ARRAY elements, or maximum bit length for other types
+  // 1: parquet physical type for output
+  Type type = UNDEFINED_TYPE;
+  // 2: byte length of FIXED_LENGTH_BYTE_ARRAY elements, or maximum bit length for other types
+  int32_t type_length = 0;
+  // 3: repetition of the field
   FieldRepetitionType repetition_type = REQUIRED;
-  std::string name                    = "";
-  int32_t num_children                = 0;
-  int32_t decimal_scale               = 0;
-  int32_t decimal_precision           = 0;
-  thrust::optional<int32_t> field_id  = thrust::nullopt;
-  bool output_as_byte_array           = false;
+  // 4: name of the field
+  std::string name = "";
+  // 5: nested fields
+  int32_t num_children = 0;
+  // 6: DEPRECATED: record the original type before conversion to parquet type
+  thrust::optional<ConvertedType> converted_type;
+  // 7: DEPRECATED: record the scale for DECIMAL converted type
+  int32_t decimal_scale = 0;
+  // 8: DEPRECATED: record the precision for DECIMAL converted type
+  int32_t decimal_precision = 0;
+  // 9: save field_id from original schema
+  thrust::optional<int32_t> field_id;
+  // 10: replaces converted type
+  thrust::optional<LogicalType> logical_type;
+
+  // extra cudf specific fields
+  bool output_as_byte_array = false;
 
   // The following fields are filled in later during schema initialization
   int max_definition_level = 0;
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 164e2cea2ed..68851e72663 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -313,7 +313,7 @@ struct ColumnChunkDesc {
                            uint8_t rep_level_bits_,
                            int8_t codec_,
                            int8_t converted_type_,
-                           LogicalType logical_type_,
+                           thrust::optional<LogicalType> logical_type_,
                            int8_t decimal_precision_,
                            int32_t ts_clock_rate_,
                            int32_t src_col_index_,
@@ -355,20 +355,20 @@ struct ColumnChunkDesc {
   uint16_t data_type{};  // basic column data type, ((type_length << 3) |
                          // parquet::Type)
   uint8_t
-    level_bits[level_type::NUM_LEVEL_TYPES]{};  // bits to encode max definition/repetition levels
-  int32_t num_data_pages{};                     // number of data pages
-  int32_t num_dict_pages{};                     // number of dictionary pages
-  int32_t max_num_pages{};                      // size of page_info array
-  PageInfo* page_info{};                        // output page info for up to num_dict_pages +
-                                                // num_data_pages (dictionary pages first)
-  string_index_pair* str_dict_index{};          // index for string dictionary
-  bitmask_type** valid_map_base{};              // base pointers of valid bit map for this column
-  void** column_data_base{};                    // base pointers of column data
-  void** column_string_base{};                  // base pointers of column string data
-  int8_t codec{};                               // compressed codec enum
-  int8_t converted_type{};                      // converted type enum
-  LogicalType logical_type{};                   // logical type
-  int8_t decimal_precision{};                   // Decimal precision
+    level_bits[level_type::NUM_LEVEL_TYPES]{};   // bits to encode max definition/repetition levels
+  int32_t num_data_pages{};                      // number of data pages
+  int32_t num_dict_pages{};                      // number of dictionary pages
+  int32_t max_num_pages{};                       // size of page_info array
+  PageInfo* page_info{};                         // output page info for up to num_dict_pages +
+                                                 // num_data_pages (dictionary pages first)
+  string_index_pair* str_dict_index{};           // index for string dictionary
+  bitmask_type** valid_map_base{};               // base pointers of valid bit map for this column
+  void** column_data_base{};                     // base pointers of column data
+  void** column_string_base{};                   // base pointers of column string data
+  int8_t codec{};                                // compressed codec enum
+  int8_t converted_type{};                       // converted type enum
+  thrust::optional<LogicalType> logical_type{};  // logical type
+  int8_t decimal_precision{};                    // Decimal precision
   int32_t ts_clock_rate{};  // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns)
 
   int32_t src_col_index{};   // my input column index
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index ad52a7dfcc1..213fc380a34 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -304,11 +304,12 @@ std::vector<chunk_read_info> find_splits(std::vector<cumulative_row_info> const&
  *
  * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type.
  */
-[[nodiscard]] std::tuple<int32_t, int32_t, int8_t> conversion_info(type_id column_type_id,
-                                                                   type_id timestamp_type_id,
-                                                                   Type physical,
-                                                                   int8_t converted,
-                                                                   int32_t length)
+[[nodiscard]] std::tuple<int32_t, int32_t, int8_t> conversion_info(
+  type_id column_type_id,
+  type_id timestamp_type_id,
+  Type physical,
+  thrust::optional<ConvertedType> converted,
+  int32_t length)
 {
   int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0;
   int32_t clock_rate = 0;
@@ -322,7 +323,7 @@ std::vector<chunk_read_info> find_splits(std::vector<cumulative_row_info> const&
     clock_rate = to_clockrate(timestamp_type_id);
   }
 
-  int8_t converted_type = converted;
+  int8_t converted_type = converted.value_or(UNKNOWN);
   if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 &&
       not cudf::is_fixed_point(data_type{column_type_id})) {
     converted_type = UNKNOWN;  // Not converting to float64 or decimal
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index 040c6403f57..a9c84143e1a 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -25,44 +25,42 @@ namespace cudf::io::parquet::detail {
 
 namespace {
 
-ConvertedType logical_type_to_converted_type(LogicalType const& logical)
+ConvertedType logical_type_to_converted_type(thrust::optional<LogicalType> const& logical)
 {
-  if (logical.isset.STRING) {
-    return UTF8;
-  } else if (logical.isset.MAP) {
-    return MAP;
-  } else if (logical.isset.LIST) {
-    return LIST;
-  } else if (logical.isset.ENUM) {
-    return ENUM;
-  } else if (logical.isset.DECIMAL) {
-    return DECIMAL;  // TODO set decimal values
-  } else if (logical.isset.DATE) {
-    return DATE;
-  } else if (logical.isset.TIME) {
-    if (logical.TIME.unit.isset.MILLIS)
-      return TIME_MILLIS;
-    else if (logical.TIME.unit.isset.MICROS)
-      return TIME_MICROS;
-  } else if (logical.isset.TIMESTAMP) {
-    if (logical.TIMESTAMP.unit.isset.MILLIS)
-      return TIMESTAMP_MILLIS;
-    else if (logical.TIMESTAMP.unit.isset.MICROS)
-      return TIMESTAMP_MICROS;
-  } else if (logical.isset.INTEGER) {
-    switch (logical.INTEGER.bitWidth) {
-      case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8;
-      case 16: return logical.INTEGER.isSigned ? INT_16 : UINT_16;
-      case 32: return logical.INTEGER.isSigned ? INT_32 : UINT_32;
-      case 64: return logical.INTEGER.isSigned ? INT_64 : UINT_64;
-      default: break;
-    }
-  } else if (logical.isset.UNKNOWN) {
-    return NA;
-  } else if (logical.isset.JSON) {
-    return JSON;
-  } else if (logical.isset.BSON) {
-    return BSON;
+  if (not logical.has_value()) { return UNKNOWN; }
+  switch (logical->type) {
+    case LogicalType::STRING: return UTF8;
+    case LogicalType::MAP: return MAP;
+    case LogicalType::LIST: return LIST;
+    case LogicalType::ENUM: return ENUM;
+    case LogicalType::DECIMAL: return DECIMAL;  // TODO use decimal scale/precision
+    case LogicalType::DATE: return DATE;
+    case LogicalType::TIME:
+      if (logical->is_time_millis()) {
+        return TIME_MILLIS;
+      } else if (logical->is_time_micros()) {
+        return TIME_MICROS;
+      }
+      break;
+    case LogicalType::TIMESTAMP:
+      if (logical->is_timestamp_millis()) {
+        return TIMESTAMP_MILLIS;
+      } else if (logical->is_timestamp_micros()) {
+        return TIMESTAMP_MICROS;
+      }
+      break;
+    case LogicalType::INTEGER:
+      switch (logical->bit_width()) {
+        case 8: return logical->is_signed() ? INT_8 : UINT_8;
+        case 16: return logical->is_signed() ? INT_16 : UINT_16;
+        case 32: return logical->is_signed() ? INT_32 : UINT_32;
+        case 64: return logical->is_signed() ? INT_64 : UINT_64;
+        default: break;
+      }
+    case LogicalType::UNKNOWN: return NA;
+    case LogicalType::JSON: return JSON;
+    case LogicalType::BSON: return BSON;
+    default: break;
   }
   return UNKNOWN;
 }
@@ -76,20 +74,20 @@ type_id to_type_id(SchemaElement const& schema,
                    bool strings_to_categorical,
                    type_id timestamp_type_id)
 {
-  Type const physical            = schema.type;
-  LogicalType const logical_type = schema.logical_type;
-  ConvertedType converted_type   = schema.converted_type;
-  int32_t decimal_precision      = schema.decimal_precision;
+  auto const physical       = schema.type;
+  auto const logical_type   = schema.logical_type;
+  auto converted_type       = schema.converted_type;
+  int32_t decimal_precision = schema.decimal_precision;
 
+  // FIXME(ets): this should just use logical type to deduce the type_id. then fall back to
+  // converted_type if logical_type isn't set
   // Logical type used for actual data interpretation; the legacy converted type
   // is superseded by 'logical' type whenever available.
   auto const inferred_converted_type = logical_type_to_converted_type(logical_type);
   if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; }
-  if (inferred_converted_type == DECIMAL) {
-    decimal_precision = schema.logical_type.DECIMAL.precision;
-  }
+  if (inferred_converted_type == DECIMAL) { decimal_precision = schema.logical_type->precision(); }
 
-  switch (converted_type) {
+  switch (converted_type.value_or(UNKNOWN)) {
     case UINT_8: return type_id::UINT8;
     case INT_8: return type_id::INT8;
     case UINT_16: return type_id::UINT16;
@@ -140,15 +138,13 @@ type_id to_type_id(SchemaElement const& schema,
     default: break;
   }
 
-  if (inferred_converted_type == UNKNOWN and physical == INT64 and
-      logical_type.TIMESTAMP.unit.isset.NANOS) {
-    return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id
-                                                 : type_id::TIMESTAMP_NANOSECONDS;
-  }
-
-  if (inferred_converted_type == UNKNOWN and physical == INT64 and
-      logical_type.TIME.unit.isset.NANOS) {
-    return type_id::DURATION_NANOSECONDS;
+  if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.has_value()) {
+    if (logical_type->is_timestamp_nanos()) {
+      return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id
+                                                   : type_id::TIMESTAMP_NANOSECONDS;
+    } else if (logical_type->is_time_nanos()) {
+      return type_id::DURATION_NANOSECONDS;
+    }
   }
 
   // is it simply a struct?
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 50589f23626..c06acc1690b 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -284,6 +284,7 @@ struct leaf_schema_fn {
   {
     col_schema.type        = Type::BOOLEAN;
     col_schema.stats_dtype = statistics_dtype::dtype_bool;
+    // BOOLEAN needs no converted or logical type
   }
 
   template <typename T>
@@ -292,6 +293,7 @@ struct leaf_schema_fn {
     col_schema.type           = Type::INT32;
     col_schema.converted_type = ConvertedType::INT_8;
     col_schema.stats_dtype    = statistics_dtype::dtype_int8;
+    col_schema.logical_type   = LogicalType{IntType{8, true}};
   }
 
   template <typename T>
@@ -300,6 +302,7 @@ struct leaf_schema_fn {
     col_schema.type           = Type::INT32;
     col_schema.converted_type = ConvertedType::INT_16;
     col_schema.stats_dtype    = statistics_dtype::dtype_int16;
+    col_schema.logical_type   = LogicalType{IntType{16, true}};
   }
 
   template <typename T>
@@ -307,6 +310,7 @@ struct leaf_schema_fn {
   {
     col_schema.type        = Type::INT32;
     col_schema.stats_dtype = statistics_dtype::dtype_int32;
+    // INT32 needs no converted or logical type
   }
 
   template <typename T>
@@ -314,6 +318,7 @@ struct leaf_schema_fn {
   {
     col_schema.type        = Type::INT64;
     col_schema.stats_dtype = statistics_dtype::dtype_int64;
+    // INT64 needs no converted or logical type
   }
 
   template <typename T>
@@ -322,6 +327,7 @@ struct leaf_schema_fn {
     col_schema.type           = Type::INT32;
     col_schema.converted_type = ConvertedType::UINT_8;
     col_schema.stats_dtype    = statistics_dtype::dtype_int8;
+    col_schema.logical_type   = LogicalType{IntType{8, false}};
   }
 
   template <typename T>
@@ -330,6 +336,7 @@ struct leaf_schema_fn {
     col_schema.type           = Type::INT32;
     col_schema.converted_type = ConvertedType::UINT_16;
     col_schema.stats_dtype    = statistics_dtype::dtype_int16;
+    col_schema.logical_type   = LogicalType{IntType{16, false}};
   }
 
   template <typename T>
@@ -338,6 +345,7 @@ struct leaf_schema_fn {
     col_schema.type           = Type::INT32;
     col_schema.converted_type = ConvertedType::UINT_32;
     col_schema.stats_dtype    = statistics_dtype::dtype_int32;
+    col_schema.logical_type   = LogicalType{IntType{32, false}};
   }
 
   template <typename T>
@@ -346,6 +354,7 @@ struct leaf_schema_fn {
     col_schema.type           = Type::INT64;
     col_schema.converted_type = ConvertedType::UINT_64;
     col_schema.stats_dtype    = statistics_dtype::dtype_int64;
+    col_schema.logical_type   = LogicalType{IntType{64, false}};
   }
 
   template <typename T>
@@ -353,6 +362,7 @@ struct leaf_schema_fn {
   {
     col_schema.type        = Type::FLOAT;
     col_schema.stats_dtype = statistics_dtype::dtype_float32;
+    // FLOAT needs no converted or logical type
   }
 
   template <typename T>
@@ -360,6 +370,7 @@ struct leaf_schema_fn {
   {
     col_schema.type        = Type::DOUBLE;
     col_schema.stats_dtype = statistics_dtype::dtype_float64;
+    // DOUBLE needs no converted or logical type
   }
 
   template <typename T>
@@ -367,11 +378,12 @@ struct leaf_schema_fn {
   {
     col_schema.type = Type::BYTE_ARRAY;
     if (col_meta.is_enabled_output_as_binary()) {
-      col_schema.converted_type = ConvertedType::UNKNOWN;
-      col_schema.stats_dtype    = statistics_dtype::dtype_byte_array;
+      col_schema.stats_dtype = statistics_dtype::dtype_byte_array;
+      // BYTE_ARRAY needs no converted or logical type
     } else {
       col_schema.converted_type = ConvertedType::UTF8;
       col_schema.stats_dtype    = statistics_dtype::dtype_string;
+      col_schema.logical_type   = LogicalType{LogicalType::STRING};
     }
   }
 
@@ -381,49 +393,55 @@ struct leaf_schema_fn {
     col_schema.type           = Type::INT32;
     col_schema.converted_type = ConvertedType::DATE;
     col_schema.stats_dtype    = statistics_dtype::dtype_int32;
+    col_schema.logical_type   = LogicalType{LogicalType::DATE};
   }
 
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::timestamp_s>, void> operator()()
   {
-    col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
-    col_schema.converted_type =
-      (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MILLIS;
+    col_schema.type        = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
     col_schema.stats_dtype = statistics_dtype::dtype_timestamp64;
     col_schema.ts_scale    = 1000;
+    if (not timestamp_is_int96) {
+      col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS;
+      col_schema.logical_type   = LogicalType{TimestampType{false, TimeUnit::MILLIS}};
+    }
   }
 
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::timestamp_ms>, void> operator()()
   {
-    col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
-    col_schema.converted_type =
-      (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MILLIS;
+    col_schema.type        = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
     col_schema.stats_dtype = statistics_dtype::dtype_timestamp64;
+    if (not timestamp_is_int96) {
+      col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS;
+      col_schema.logical_type   = LogicalType{TimestampType{false, TimeUnit::MILLIS}};
+    }
   }
 
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::timestamp_us>, void> operator()()
   {
-    col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
-    col_schema.converted_type =
-      (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MICROS;
+    col_schema.type        = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
     col_schema.stats_dtype = statistics_dtype::dtype_timestamp64;
+    if (not timestamp_is_int96) {
+      col_schema.converted_type = ConvertedType::TIMESTAMP_MICROS;
+      col_schema.logical_type   = LogicalType{TimestampType{false, TimeUnit::MICROS}};
+    }
   }
 
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::timestamp_ns>, void> operator()()
   {
     col_schema.type           = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
-    col_schema.converted_type = ConvertedType::UNKNOWN;
+    col_schema.converted_type = thrust::nullopt;
     col_schema.stats_dtype    = statistics_dtype::dtype_timestamp64;
     if (timestamp_is_int96) {
       col_schema.ts_scale = -1000;  // negative value indicates division by absolute value
     }
     // set logical type if it's not int96
     else {
-      col_schema.logical_type.isset.TIMESTAMP            = true;
-      col_schema.logical_type.TIMESTAMP.unit.isset.NANOS = true;
+      col_schema.logical_type = LogicalType{TimestampType{false, TimeUnit::NANOS}};
     }
   }
 
@@ -431,53 +449,48 @@ struct leaf_schema_fn {
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::duration_D>, void> operator()()
   {
-    col_schema.type                                = Type::INT32;
-    col_schema.converted_type                      = ConvertedType::TIME_MILLIS;
-    col_schema.stats_dtype                         = statistics_dtype::dtype_int32;
-    col_schema.ts_scale                            = 24 * 60 * 60 * 1000;
-    col_schema.logical_type.isset.TIME             = true;
-    col_schema.logical_type.TIME.unit.isset.MILLIS = true;
+    col_schema.type           = Type::INT32;
+    col_schema.converted_type = ConvertedType::TIME_MILLIS;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int32;
+    col_schema.ts_scale       = 24 * 60 * 60 * 1000;
+    col_schema.logical_type   = LogicalType{TimeType{false, TimeUnit::MILLIS}};
   }
 
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::duration_s>, void> operator()()
   {
-    col_schema.type                                = Type::INT32;
-    col_schema.converted_type                      = ConvertedType::TIME_MILLIS;
-    col_schema.stats_dtype                         = statistics_dtype::dtype_int32;
-    col_schema.ts_scale                            = 1000;
-    col_schema.logical_type.isset.TIME             = true;
-    col_schema.logical_type.TIME.unit.isset.MILLIS = true;
+    col_schema.type           = Type::INT32;
+    col_schema.converted_type = ConvertedType::TIME_MILLIS;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int32;
+    col_schema.ts_scale       = 1000;
+    col_schema.logical_type   = LogicalType{TimeType{false, TimeUnit::MILLIS}};
   }
 
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::duration_ms>, void> operator()()
   {
-    col_schema.type                                = Type::INT32;
-    col_schema.converted_type                      = ConvertedType::TIME_MILLIS;
-    col_schema.stats_dtype                         = statistics_dtype::dtype_int32;
-    col_schema.logical_type.isset.TIME             = true;
-    col_schema.logical_type.TIME.unit.isset.MILLIS = true;
+    col_schema.type           = Type::INT32;
+    col_schema.converted_type = ConvertedType::TIME_MILLIS;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int32;
+    col_schema.logical_type   = LogicalType{TimeType{false, TimeUnit::MILLIS}};
   }
 
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::duration_us>, void> operator()()
   {
-    col_schema.type                                = Type::INT64;
-    col_schema.converted_type                      = ConvertedType::TIME_MICROS;
-    col_schema.stats_dtype                         = statistics_dtype::dtype_int64;
-    col_schema.logical_type.isset.TIME             = true;
-    col_schema.logical_type.TIME.unit.isset.MICROS = true;
+    col_schema.type           = Type::INT64;
+    col_schema.converted_type = ConvertedType::TIME_MICROS;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int64;
+    col_schema.logical_type   = LogicalType{TimeType{false, TimeUnit::MICROS}};
   }
 
   //  unsupported outside cudf for parquet 1.0.
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::duration_ns>, void> operator()()
   {
-    col_schema.type                               = Type::INT64;
-    col_schema.stats_dtype                        = statistics_dtype::dtype_int64;
-    col_schema.logical_type.isset.TIME            = true;
-    col_schema.logical_type.TIME.unit.isset.NANOS = true;
+    col_schema.type         = Type::INT64;
+    col_schema.stats_dtype  = statistics_dtype::dtype_int64;
+    col_schema.logical_type = LogicalType{TimeType{false, TimeUnit::NANOS}};
   }
 
   template <typename T>
@@ -487,27 +500,32 @@ struct leaf_schema_fn {
       col_schema.type              = Type::INT32;
       col_schema.stats_dtype       = statistics_dtype::dtype_int32;
       col_schema.decimal_precision = MAX_DECIMAL32_PRECISION;
+      col_schema.logical_type      = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}};
     } else if (std::is_same_v<T, numeric::decimal64>) {
       col_schema.type              = Type::INT64;
       col_schema.stats_dtype       = statistics_dtype::dtype_decimal64;
       col_schema.decimal_precision = MAX_DECIMAL64_PRECISION;
+      col_schema.logical_type      = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}};
     } else if (std::is_same_v<T, numeric::decimal128>) {
       col_schema.type              = Type::FIXED_LEN_BYTE_ARRAY;
       col_schema.type_length       = sizeof(__int128_t);
       col_schema.stats_dtype       = statistics_dtype::dtype_decimal128;
       col_schema.decimal_precision = MAX_DECIMAL128_PRECISION;
+      col_schema.logical_type      = LogicalType{DecimalType{0, MAX_DECIMAL128_PRECISION}};
     } else {
       CUDF_FAIL("Unsupported fixed point type for parquet writer");
     }
     col_schema.converted_type = ConvertedType::DECIMAL;
     col_schema.decimal_scale = -col->type().scale();  // parquet and cudf disagree about scale signs
+    col_schema.logical_type->decimal_type->scale = -col->type().scale();
     if (col_meta.is_decimal_precision_set()) {
       CUDF_EXPECTS(col_meta.get_decimal_precision() >= col_schema.decimal_scale,
                    "Precision must be equal to or greater than scale!");
       if (col_schema.type == Type::INT64 and col_meta.get_decimal_precision() < 10) {
         CUDF_LOG_WARN("Parquet writer: writing a decimal column with precision < 10 as int64");
       }
-      col_schema.decimal_precision = col_meta.get_decimal_precision();
+      col_schema.decimal_precision                     = col_meta.get_decimal_precision();
+      col_schema.logical_type->decimal_type->precision = col_meta.get_decimal_precision();
     }
   }
 
@@ -593,7 +611,7 @@ std::vector<schema_tree_node> construct_schema_tree(
 
         schema_tree_node col_schema{};
         col_schema.type            = Type::BYTE_ARRAY;
-        col_schema.converted_type  = ConvertedType::UNKNOWN;
+        col_schema.converted_type  = thrust::nullopt;
         col_schema.stats_dtype     = statistics_dtype::dtype_byte_array;
         col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED;
         col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
@@ -762,7 +780,10 @@ struct parquet_column_view {
 
   [[nodiscard]] column_view cudf_column_view() const { return cudf_col; }
   [[nodiscard]] Type physical_type() const { return schema_node.type; }
-  [[nodiscard]] ConvertedType converted_type() const { return schema_node.converted_type; }
+  [[nodiscard]] ConvertedType converted_type() const
+  {
+    return schema_node.converted_type.value_or(UNKNOWN);
+  }
 
   std::vector<std::string> const& get_path_in_schema() { return path_in_schema; }
 
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 2a654bd7e8c..fece83f891b 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -4075,11 +4075,12 @@ int32_t compare(T& v1, T& v2)
 int32_t compare_binary(std::vector<uint8_t> const& v1,
                        std::vector<uint8_t> const& v2,
                        cudf::io::parquet::detail::Type ptype,
-                       cudf::io::parquet::detail::ConvertedType ctype)
+                       thrust::optional<cudf::io::parquet::detail::ConvertedType> const& ctype)
 {
+  auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN);
   switch (ptype) {
     case cudf::io::parquet::detail::INT32:
-      switch (ctype) {
+      switch (ctype_val) {
         case cudf::io::parquet::detail::UINT_8:
         case cudf::io::parquet::detail::UINT_16:
         case cudf::io::parquet::detail::UINT_32:
@@ -4091,7 +4092,7 @@ int32_t compare_binary(std::vector<uint8_t> const& v1,
       }
 
     case cudf::io::parquet::detail::INT64:
-      if (ctype == cudf::io::parquet::detail::UINT_64) {
+      if (ctype_val == cudf::io::parquet::detail::UINT_64) {
         return compare(*(reinterpret_cast<uint64_t const*>(v1.data())),
                        *(reinterpret_cast<uint64_t const*>(v2.data())));
       }

From 8ae3aab79e14f8af733879e1e4b62b75b0f62368 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Mon, 23 Oct 2023 10:43:33 -0700
Subject: [PATCH 6/7] Extract `debug_utilities.hpp/cu` from
 `column_utilities.hpp/cu` (#13720)

This PR extracts the implementation of the debug utility function `cudf::test::print()` from `column_utilities.hpp/cu` into its separate header/source files (`debug_utilities.hpp/cu`) for better organizing the relevant code. The new header name is also more expressive and more relevant to its purpose.

The changes in this PR are only moving code around. Not any new functionality or implementation was added.
Closes https://github.com/rapidsai/cudf/issues/13450 (although this is not to address that issue).

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Robert Maynard (https://github.com/robertmaynard)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/13720
---
 cpp/CMakeLists.txt                            |   1 +
 cpp/include/cudf_test/column_utilities.hpp    |  33 --
 cpp/include/cudf_test/debug_utilities.hpp     |  47 ++
 .../cudf_test/detail/column_utilities.hpp     |  85 ----
 cpp/tests/CMakeLists.txt                      |   1 +
 cpp/tests/groupby/structs_tests.cpp           |   1 +
 cpp/tests/utilities/column_utilities.cu       | 408 +--------------
 cpp/tests/utilities/debug_utilities.cu        | 480 ++++++++++++++++++
 .../utilities_tests/column_debug_tests.cpp    | 137 +++++
 .../column_utilities_tests.cpp                | 100 ----
 10 files changed, 674 insertions(+), 619 deletions(-)
 create mode 100644 cpp/include/cudf_test/debug_utilities.hpp
 delete mode 100644 cpp/include/cudf_test/detail/column_utilities.hpp
 create mode 100644 cpp/tests/utilities/debug_utilities.cu
 create mode 100644 cpp/tests/utilities_tests/column_debug_tests.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f8b9762f1d4..472ee9d9fd4 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -835,6 +835,7 @@ if(CUDF_BUILD_TESTUTIL)
     tests/io/metadata_utilities.cpp
     tests/utilities/base_fixture.cpp
     tests/utilities/column_utilities.cu
+    tests/utilities/debug_utilities.cu
     tests/utilities/table_utilities.cu
     tests/utilities/tdigest_utilities.cu
   )
diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp
index 059bd10eae1..f6872fcdd6d 100644
--- a/cpp/include/cudf_test/column_utilities.hpp
+++ b/cpp/include/cudf_test/column_utilities.hpp
@@ -140,39 +140,6 @@ void expect_equal_buffers(void const* lhs, void const* rhs, std::size_t size_byt
  */
 void expect_column_empty(cudf::column_view const& col);
 
-/**
- * @brief Formats a column view as a string
- *
- * @param col The column view
- * @param delimiter The delimiter to put between strings
- */
-std::string to_string(cudf::column_view const& col, std::string const& delimiter);
-
-/**
- * @brief Formats a null mask as a string
- *
- * @param null_mask The null mask buffer
- * @param null_mask_size Size of the null mask (in rows)
- */
-std::string to_string(std::vector<bitmask_type> const& null_mask, size_type null_mask_size);
-
-/**
- * @brief Convert column values to a host vector of strings
- *
- * @param col The column view
- */
-std::vector<std::string> to_strings(cudf::column_view const& col);
-
-/**
- * @brief Print a column view to an ostream
- *
- * @param os        The output stream
- * @param col       The column view
- */
-void print(cudf::column_view const& col,
-           std::ostream& os             = std::cout,
-           std::string const& delimiter = ",");
-
 /**
  * @brief Copy the null bitmask from a column view to a host vector
  *
diff --git a/cpp/include/cudf_test/debug_utilities.hpp b/cpp/include/cudf_test/debug_utilities.hpp
new file mode 100644
index 00000000000..a0881490b82
--- /dev/null
+++ b/cpp/include/cudf_test/debug_utilities.hpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/null_mask.hpp>
+
+namespace cudf::test {
+
+/**
+ * @brief Formats a column view as a string
+ *
+ * @param col The input column view
+ * @param delimiter The delimiter to put between strings
+ */
+std::string to_string(cudf::column_view const& col, std::string const& delimiter);
+
+/**
+ * @brief Convert column values to a host vector of strings
+ *
+ * @param col The input column view
+ */
+std::vector<std::string> to_strings(cudf::column_view const& col);
+
+/**
+ * @brief Print a column view to an ostream
+ *
+ * @param col The input column view
+ * @param os The output stream
+ */
+void print(cudf::column_view const& col, std::ostream& os = std::cout);
+
+}  // namespace cudf::test
diff --git a/cpp/include/cudf_test/detail/column_utilities.hpp b/cpp/include/cudf_test/detail/column_utilities.hpp
deleted file mode 100644
index f8270f61f10..00000000000
--- a/cpp/include/cudf_test/detail/column_utilities.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cudf/column/column_view.hpp>
-#include <cudf/null_mask.hpp>
-
-namespace cudf {
-namespace test {
-namespace detail {
-
-/**
- * @brief Formats a column view as a string
- *
- * @param col The column view
- * @param delimiter The delimiter to put between strings
- * @param indent Indentation for all output. See detail::to_strings for detailed
- * explanation.
- */
-std::string to_string(cudf::column_view const& col,
-                      std::string const& delimiter,
-                      std::string const& indent = "");
-
-/**
- * @brief Formats a null mask as a string
- *
- * @param null_mask The null mask buffer
- * @param null_mask_size Size of the null mask (in rows)
- * @param indent Indentation for all output. See detail::to_strings for detailed
- * explanation.
- */
-std::string to_string(std::vector<bitmask_type> const& null_mask,
-                      size_type null_mask_size,
-                      std::string const& indent = "");
-
-/**
- * @brief Convert column values to a host vector of strings
- *
- * Supports indentation of all output.  For example, if the displayed output of your column
- * would be
- *
- * @code{.pseudo}
- * "1,2,3,4,5"
- * @endcode
- * and the `indent` parameter was "   ", that indentation would be prepended to
- * result in the output
- * @code{.pseudo}
- * "   1,2,3,4,5"
- * @endcode
- *
- * The can be useful for displaying complex types. An example use case would be for
- * displaying the nesting of a LIST type column (via recursion).
- *
- *  List<List<int>>:
- *  Length : 3
- *  Offsets : 0, 2, 5, 6
- *  Children :
- *     List<int>:
- *     Length : 6
- *     Offsets : 0, 2, 4, 7, 8, 9, 11
- *     Children :
- *        1, 2, 3, 4, 5, 6, 7, 0, 8, 9, 10
- *
- * @param col The column view
- * @param indent Indentation for all output
- */
-std::vector<std::string> to_strings(cudf::column_view const& col, std::string const& indent = "");
-
-}  // namespace detail
-}  // namespace test
-}  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 16e7239ebd8..eb0585d8f3e 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -357,6 +357,7 @@ ConfigureTest(
 ConfigureTest(
   UTILITIES_TEST
   utilities_tests/type_list_tests.cpp
+  utilities_tests/column_debug_tests.cpp
   utilities_tests/column_utilities_tests.cpp
   utilities_tests/column_wrapper_tests.cpp
   utilities_tests/lists_column_wrapper_tests.cpp
diff --git a/cpp/tests/groupby/structs_tests.cpp b/cpp/tests/groupby/structs_tests.cpp
index f85fc6335f6..af6f613d344 100644
--- a/cpp/tests/groupby/structs_tests.cpp
+++ b/cpp/tests/groupby/structs_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/debug_utilities.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index 620e0bfe8de..f54ea28d9b2 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -14,28 +14,24 @@
  * limitations under the License.
  */
 
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/debug_utilities.hpp>
+#include <cudf_test/default_stream.hpp>
+
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/dictionary/dictionary_column_view.hpp>
-#include <cudf/lists/lists_column_view.hpp>
-#include <cudf/strings/convert/convert_datetime.hpp>
+#include <cudf/lists/list_view.hpp>
 #include <cudf/structs/struct_view.hpp>
-#include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_device_view.cuh>
-#include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <cudf_test/column_utilities.hpp>
-#include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/default_stream.hpp>
-#include <cudf_test/detail/column_utilities.hpp>
-
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/copy.h>
@@ -928,396 +924,6 @@ std::vector<bitmask_type> bitmask_to_host(cudf::column_view const& c)
   }
 }
 
-namespace {
-
-template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
-static auto numeric_to_string_precise(T value)
-{
-  return std::to_string(value);
-}
-
-template <typename T, std::enable_if_t<std::is_floating_point_v<T>>* = nullptr>
-static auto numeric_to_string_precise(T value)
-{
-  std::ostringstream o;
-  o << std::setprecision(std::numeric_limits<T>::max_digits10) << value;
-  return o.str();
-}
-
-static auto duration_suffix(cudf::duration_D) { return " days"; }
-
-static auto duration_suffix(cudf::duration_s) { return " seconds"; }
-
-static auto duration_suffix(cudf::duration_ms) { return " milliseconds"; }
-
-static auto duration_suffix(cudf::duration_us) { return " microseconds"; }
-
-static auto duration_suffix(cudf::duration_ns) { return " nanoseconds"; }
-
-std::string get_nested_type_str(cudf::column_view const& view)
-{
-  if (view.type().id() == cudf::type_id::LIST) {
-    lists_column_view lcv(view);
-    return cudf::type_to_name(view.type()) + "<" + (get_nested_type_str(lcv.child())) + ">";
-  }
-
-  if (view.type().id() == cudf::type_id::STRUCT) {
-    std::ostringstream out;
-
-    out << cudf::type_to_name(view.type()) + "<";
-    std::transform(view.child_begin(),
-                   view.child_end(),
-                   std::ostream_iterator<std::string>(out, ","),
-                   [&out](auto const col) { return get_nested_type_str(col); });
-    out << ">";
-    return out.str();
-  }
-
-  return cudf::type_to_name(view.type());
-}
-
-template <typename NestedColumnView>
-std::string nested_offsets_to_string(NestedColumnView const& c, std::string const& delimiter = ", ")
-{
-  column_view offsets = (c.parent()).child(NestedColumnView::offsets_column_index);
-  CUDF_EXPECTS(offsets.type().id() == type_id::INT32,
-               "Column does not appear to be an offsets column");
-  CUDF_EXPECTS(offsets.offset() == 0, "Offsets column has an internal offset!");
-  size_type output_size = c.size() + 1;
-
-  // the first offset value to normalize everything against
-  size_type first =
-    cudf::detail::get_value<size_type>(offsets, c.offset(), cudf::test::get_default_stream());
-  rmm::device_uvector<size_type> shifted_offsets(output_size, cudf::test::get_default_stream());
-
-  // normalize the offset values for the column offset
-  size_type const* d_offsets = offsets.head<size_type>() + c.offset();
-  thrust::transform(
-    rmm::exec_policy(cudf::test::get_default_stream()),
-    d_offsets,
-    d_offsets + output_size,
-    shifted_offsets.begin(),
-    [first] __device__(int32_t offset) { return static_cast<size_type>(offset - first); });
-
-  auto const h_shifted_offsets =
-    cudf::detail::make_host_vector_sync(shifted_offsets, cudf::test::get_default_stream());
-  std::ostringstream buffer;
-  for (size_t idx = 0; idx < h_shifted_offsets.size(); idx++) {
-    buffer << h_shifted_offsets[idx];
-    if (idx < h_shifted_offsets.size() - 1) { buffer << delimiter; }
-  }
-  return buffer.str();
-}
-
-struct column_view_printer {
-  template <typename Element, std::enable_if_t<is_numeric<Element>()>* = nullptr>
-  void operator()(cudf::column_view const& col, std::vector<std::string>& out, std::string const&)
-  {
-    auto h_data = cudf::test::to_host<Element>(col);
-
-    out.resize(col.size());
-
-    if (col.nullable()) {
-      std::transform(thrust::make_counting_iterator(size_type{0}),
-                     thrust::make_counting_iterator(col.size()),
-                     out.begin(),
-                     [&h_data](auto idx) {
-                       return bit_is_set(h_data.second.data(), idx)
-                                ? numeric_to_string_precise(h_data.first[idx])
-                                : std::string("NULL");
-                     });
-
-    } else {
-      std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) {
-        return numeric_to_string_precise(el);
-      });
-    }
-  }
-
-  template <typename Element, std::enable_if_t<is_timestamp<Element>()>* = nullptr>
-  void operator()(cudf::column_view const& col,
-                  std::vector<std::string>& out,
-                  std::string const& indent)
-  {
-    //  For timestamps, convert timestamp column to column of strings, then
-    //  call string version
-    std::string format = [&]() {
-      if constexpr (std::is_same_v<cudf::timestamp_s, Element>) {
-        return std::string{"%Y-%m-%dT%H:%M:%SZ"};
-      } else if constexpr (std::is_same_v<cudf::timestamp_ms, Element>) {
-        return std::string{"%Y-%m-%dT%H:%M:%S.%3fZ"};
-      } else if constexpr (std::is_same_v<cudf::timestamp_us, Element>) {
-        return std::string{"%Y-%m-%dT%H:%M:%S.%6fZ"};
-      } else if constexpr (std::is_same_v<cudf::timestamp_ns, Element>) {
-        return std::string{"%Y-%m-%dT%H:%M:%S.%9fZ"};
-      }
-      return std::string{"%Y-%m-%d"};
-    }();
-
-    auto col_as_strings = cudf::strings::from_timestamps(col, format);
-    if (col_as_strings->size() == 0) { return; }
-
-    this->template operator()<cudf::string_view>(*col_as_strings, out, indent);
-  }
-
-  template <typename Element, std::enable_if_t<cudf::is_fixed_point<Element>()>* = nullptr>
-  void operator()(cudf::column_view const& col, std::vector<std::string>& out, std::string const&)
-  {
-    auto const h_data = cudf::test::to_host<Element>(col);
-    if (col.nullable()) {
-      std::transform(thrust::make_counting_iterator(size_type{0}),
-                     thrust::make_counting_iterator(col.size()),
-                     std::back_inserter(out),
-                     [&h_data](auto idx) {
-                       return h_data.second.empty() || bit_is_set(h_data.second.data(), idx)
-                                ? static_cast<std::string>(h_data.first[idx])
-                                : std::string("NULL");
-                     });
-    } else {
-      std::transform(std::cbegin(h_data.first),
-                     std::cend(h_data.first),
-                     std::back_inserter(out),
-                     [col](auto const& fp) { return static_cast<std::string>(fp); });
-    }
-  }
-
-  template <typename Element,
-            std::enable_if_t<std::is_same_v<Element, cudf::string_view>>* = nullptr>
-  void operator()(cudf::column_view const& col, std::vector<std::string>& out, std::string const&)
-  {
-    //
-    //  Implementation for strings, call special to_host variant
-    //
-    if (col.is_empty()) return;
-    auto h_data = cudf::test::to_host<std::string>(col);
-
-    // explicitly replace some special whitespace characters with their literal equivalents
-    auto cleaned = [](std::string_view in) {
-      std::string out(in);
-      auto replace_char = [](std::string& out, char c, std::string_view repl) {
-        for (std::string::size_type pos{}; out.npos != (pos = out.find(c, pos)); pos++) {
-          out.replace(pos, 1, repl);
-        }
-      };
-      replace_char(out, '\a', "\\a");
-      replace_char(out, '\b', "\\b");
-      replace_char(out, '\f', "\\f");
-      replace_char(out, '\r', "\\r");
-      replace_char(out, '\t', "\\t");
-      replace_char(out, '\n', "\\n");
-      replace_char(out, '\v', "\\v");
-      return out;
-    };
-
-    out.resize(col.size());
-    std::transform(thrust::make_counting_iterator(size_type{0}),
-                   thrust::make_counting_iterator(col.size()),
-                   out.begin(),
-                   [&](auto idx) {
-                     return h_data.second.empty() || bit_is_set(h_data.second.data(), idx)
-                              ? cleaned(h_data.first[idx])
-                              : std::string("NULL");
-                   });
-  }
-
-  template <typename Element,
-            std::enable_if_t<std::is_same_v<Element, cudf::dictionary32>>* = nullptr>
-  void operator()(cudf::column_view const& col, std::vector<std::string>& out, std::string const&)
-  {
-    cudf::dictionary_column_view dictionary(col);
-    if (col.is_empty()) return;
-    std::vector<std::string> keys    = to_strings(dictionary.keys());
-    std::vector<std::string> indices = to_strings({dictionary.indices().type(),
-                                                   dictionary.size(),
-                                                   dictionary.indices().head(),
-                                                   dictionary.null_mask(),
-                                                   dictionary.null_count(),
-                                                   dictionary.offset()});
-    out.insert(out.end(), keys.begin(), keys.end());
-    if (!indices.empty()) {
-      std::string first = "\x08 : " + indices.front();  // use : as delimiter
-      out.push_back(first);                             // between keys and indices
-      out.insert(out.end(), indices.begin() + 1, indices.end());
-    }
-  }
-
-  // Print the tick counts with the units
-  template <typename Element, std::enable_if_t<is_duration<Element>()>* = nullptr>
-  void operator()(cudf::column_view const& col, std::vector<std::string>& out, std::string const&)
-  {
-    auto h_data = cudf::test::to_host<Element>(col);
-
-    out.resize(col.size());
-
-    if (col.nullable()) {
-      std::transform(thrust::make_counting_iterator(size_type{0}),
-                     thrust::make_counting_iterator(col.size()),
-                     out.begin(),
-                     [&h_data](auto idx) {
-                       return bit_is_set(h_data.second.data(), idx)
-                                ? numeric_to_string_precise(h_data.first[idx].count()) +
-                                    duration_suffix(h_data.first[idx])
-                                : std::string("NULL");
-                     });
-
-    } else {
-      std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) {
-        return numeric_to_string_precise(el.count()) + duration_suffix(el);
-      });
-    }
-  }
-
-  template <typename Element, std::enable_if_t<std::is_same_v<Element, cudf::list_view>>* = nullptr>
-  void operator()(cudf::column_view const& col,
-                  std::vector<std::string>& out,
-                  std::string const& indent)
-  {
-    lists_column_view lcv(col);
-
-    // propagate slicing to the child if necessary
-    column_view child    = lcv.get_sliced_child(cudf::test::get_default_stream());
-    bool const is_sliced = lcv.offset() > 0 || child.offset() > 0;
-
-    std::string tmp =
-      get_nested_type_str(col) + (is_sliced ? "(sliced)" : "") + ":\n" + indent +
-      "Length : " + std::to_string(lcv.size()) + "\n" + indent +
-      "Offsets : " + (lcv.size() > 0 ? nested_offsets_to_string(lcv) : "") + "\n" +
-      (lcv.parent().nullable()
-         ? indent + "Null count: " + std::to_string(lcv.null_count()) + "\n" +
-             detail::to_string(bitmask_to_host(col), col.size(), indent) + "\n"
-         : "") +
-      // non-nested types don't typically display their null masks, so do it here for convenience.
-      (!is_nested(child.type()) && child.nullable()
-         ? "   " + detail::to_string(bitmask_to_host(child), child.size(), indent) + "\n"
-         : "") +
-      (detail::to_string(child, ", ", indent + "   ")) + "\n";
-
-    out.push_back(tmp);
-  }
-
-  template <typename Element,
-            std::enable_if_t<std::is_same_v<Element, cudf::struct_view>>* = nullptr>
-  void operator()(cudf::column_view const& col,
-                  std::vector<std::string>& out,
-                  std::string const& indent)
-  {
-    structs_column_view view{col};
-
-    std::ostringstream out_stream;
-
-    out_stream << get_nested_type_str(col) << ":\n"
-               << indent << "Length : " << view.size() << ":\n";
-    if (view.nullable()) {
-      out_stream << indent << "Null count: " << view.null_count() << "\n"
-                 << detail::to_string(bitmask_to_host(col), col.size(), indent) << "\n";
-    }
-
-    auto iter = thrust::make_counting_iterator(0);
-    std::transform(
-      iter,
-      iter + view.num_children(),
-      std::ostream_iterator<std::string>(out_stream, "\n"),
-      [&](size_type index) {
-        auto child = view.get_sliced_child(index, cudf::test::get_default_stream());
-
-        // non-nested types don't typically display their null masks, so do it here for convenience.
-        return (!is_nested(child.type()) && child.nullable()
-                  ? "   " + detail::to_string(bitmask_to_host(child), child.size(), indent) + "\n"
-                  : "") +
-               detail::to_string(child, ", ", indent + "   ");
-      });
-
-    out.push_back(out_stream.str());
-  }
-};
-
-}  // namespace
-
-namespace detail {
-
-/**
- * @copydoc cudf::test::detail::to_strings
- */
-std::vector<std::string> to_strings(cudf::column_view const& col, std::string const& indent)
-{
-  std::vector<std::string> reply;
-  cudf::type_dispatcher(col.type(), column_view_printer{}, col, reply, indent);
-  return reply;
-}
-
-/**
- * @copydoc cudf::test::detail::to_string(cudf::column_view, std::string, std::string)
- *
- * @param indent Indentation for all output
- */
-std::string to_string(cudf::column_view const& col,
-                      std::string const& delimiter,
-                      std::string const& indent)
-{
-  std::ostringstream buffer;
-  std::vector<std::string> h_data = to_strings(col, indent);
-
-  buffer << indent;
-  std::copy(h_data.begin(),
-            h_data.end() - (!h_data.empty()),
-            std::ostream_iterator<std::string>(buffer, delimiter.c_str()));
-  if (!h_data.empty()) buffer << h_data.back();
-
-  return buffer.str();
-}
-
-/**
- * @copydoc cudf::test::detail::to_string(std::vector<bitmask_type>, size_type, std::string)
- *
- * @param indent Indentation for all output.  See comment in `to_strings` for
- * a detailed description.
- */
-std::string to_string(std::vector<bitmask_type> const& null_mask,
-                      size_type null_mask_size,
-                      std::string const& indent)
-{
-  std::ostringstream buffer;
-  buffer << indent;
-  for (int idx = null_mask_size - 1; idx >= 0; idx--) {
-    buffer << (cudf::bit_is_set(null_mask.data(), idx) ? "1" : "0");
-  }
-  return buffer.str();
-}
-
-}  // namespace detail
-
-/**
- * @copydoc cudf::test::to_strings
- */
-std::vector<std::string> to_strings(cudf::column_view const& col)
-{
-  return detail::to_strings(col);
-}
-
-/**
- * @copydoc cudf::test::to_string(cudf::column_view, std::string)
- */
-std::string to_string(cudf::column_view const& col, std::string const& delimiter)
-{
-  return detail::to_string(col, delimiter);
-}
-
-/**
- * @copydoc cudf::test::to_string(std::vector<bitmask_type>, size_type)
- */
-std::string to_string(std::vector<bitmask_type> const& null_mask, size_type null_mask_size)
-{
-  return detail::to_string(null_mask, null_mask_size);
-}
-
-/**
- * @copydoc cudf::test::print
- */
-void print(cudf::column_view const& col, std::ostream& os, std::string const& delimiter)
-{
-  os << to_string(col, delimiter) << std::endl;
-}
-
 /**
  * @copydoc cudf::test::validate_host_masks
  */
diff --git a/cpp/tests/utilities/debug_utilities.cu b/cpp/tests/utilities/debug_utilities.cu
new file mode 100644
index 00000000000..a8a43ffb4ca
--- /dev/null
+++ b/cpp/tests/utilities/debug_utilities.cu
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/debug_utilities.hpp>
+
+#include <cudf/detail/get_value.cuh>
+#include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/lists/lists_column_view.hpp>
+#include <cudf/strings/convert/convert_datetime.hpp>
+#include <cudf/structs/structs_column_view.hpp>
+#include <cudf/utilities/bit.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/transform.h>
+
+#include <iomanip>
+#include <sstream>
+
+namespace cudf::test {
+
+// Forward declaration.
+namespace detail {
+
+/**
+ * @brief Formats a column view as a string
+ *
+ * @param col The column view
+ * @param delimiter The delimiter to put between strings
+ * @param indent Indentation for all output
+ */
+std::string to_string(cudf::column_view const& col,
+                      std::string const& delimiter,
+                      std::string const& indent = "");
+
+/**
+ * @brief Formats a null mask as a string
+ *
+ * @param null_mask The null mask buffer
+ * @param null_mask_size Size of the null mask (in rows)
+ * @param indent Indentation for all output
+ */
+std::string to_string(std::vector<bitmask_type> const& null_mask,
+                      size_type null_mask_size,
+                      std::string const& indent = "");
+
+/**
+ * @brief Convert column values to a host vector of strings
+ *
+ * Supports indentation of all output.  For example, if the displayed output of your column
+ * would be
+ *
+ * @code{.pseudo}
+ * "1,2,3,4,5"
+ * @endcode
+ * and the `indent` parameter was "   ", that indentation would be prepended to
+ * result in the output
+ * @code{.pseudo}
+ * "   1,2,3,4,5"
+ * @endcode
+ *
+ * The can be useful for displaying complex types. An example use case would be for
+ * displaying the nesting of a LIST type column (via recursion).
+ *
+ *  List<List<int>>:
+ *  Length : 3
+ *  Offsets : 0, 2, 5, 6
+ *  Children :
+ *     List<int>:
+ *     Length : 6
+ *     Offsets : 0, 2, 4, 7, 8, 9, 11
+ *     Children :
+ *        1, 2, 3, 4, 5, 6, 7, 0, 8, 9, 10
+ *
+ * @param col The column view
+ * @param indent Indentation for all output
+ */
+std::vector<std::string> to_strings(cudf::column_view const& col, std::string const& indent = "");
+
+}  // namespace detail
+
+namespace {
+
+template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+static auto numeric_to_string_precise(T value)
+{
+  return std::to_string(value);
+}
+
+template <typename T, std::enable_if_t<std::is_floating_point_v<T>>* = nullptr>
+static auto numeric_to_string_precise(T value)
+{
+  std::ostringstream o;
+  o << std::setprecision(std::numeric_limits<T>::max_digits10) << value;
+  return o.str();
+}
+
+static auto duration_suffix(cudf::duration_D) { return " days"; }
+
+static auto duration_suffix(cudf::duration_s) { return " seconds"; }
+
+static auto duration_suffix(cudf::duration_ms) { return " milliseconds"; }
+
+static auto duration_suffix(cudf::duration_us) { return " microseconds"; }
+
+static auto duration_suffix(cudf::duration_ns) { return " nanoseconds"; }
+
+std::string get_nested_type_str(cudf::column_view const& view)
+{
+  if (view.type().id() == cudf::type_id::LIST) {
+    lists_column_view lcv(view);
+    return cudf::type_to_name(view.type()) + "<" + (get_nested_type_str(lcv.child())) + ">";
+  }
+
+  if (view.type().id() == cudf::type_id::STRUCT) {
+    std::ostringstream out;
+
+    out << cudf::type_to_name(view.type()) + "<";
+    std::transform(view.child_begin(),
+                   view.child_end(),
+                   std::ostream_iterator<std::string>(out, ","),
+                   [&out](auto const col) { return get_nested_type_str(col); });
+    out << ">";
+    return out.str();
+  }
+
+  return cudf::type_to_name(view.type());
+}
+
+template <typename NestedColumnView>
+std::string nested_offsets_to_string(NestedColumnView const& c, std::string const& delimiter = ", ")
+{
+  column_view offsets = (c.parent()).child(NestedColumnView::offsets_column_index);
+  CUDF_EXPECTS(offsets.type().id() == type_id::INT32,
+               "Column does not appear to be an offsets column");
+  CUDF_EXPECTS(offsets.offset() == 0, "Offsets column has an internal offset!");
+  size_type output_size = c.size() + 1;
+
+  // the first offset value to normalize everything against
+  size_type first =
+    cudf::detail::get_value<size_type>(offsets, c.offset(), cudf::get_default_stream());
+  rmm::device_uvector<size_type> shifted_offsets(output_size, cudf::get_default_stream());
+
+  // normalize the offset values for the column offset
+  size_type const* d_offsets = offsets.head<size_type>() + c.offset();
+  thrust::transform(
+    rmm::exec_policy(cudf::get_default_stream()),
+    d_offsets,
+    d_offsets + output_size,
+    shifted_offsets.begin(),
+    [first] __device__(int32_t offset) { return static_cast<size_type>(offset - first); });
+
+  auto const h_shifted_offsets =
+    cudf::detail::make_host_vector_sync(shifted_offsets, cudf::get_default_stream());
+  std::ostringstream buffer;
+  for (size_t idx = 0; idx < h_shifted_offsets.size(); idx++) {
+    buffer << h_shifted_offsets[idx];
+    if (idx < h_shifted_offsets.size() - 1) { buffer << delimiter; }
+  }
+  return buffer.str();
+}
+
+struct column_view_printer {
+  template <typename Element, std::enable_if_t<is_numeric<Element>()>* = nullptr>
+  void operator()(cudf::column_view const& col, std::vector<std::string>& out, std::string const&)
+  {
+    auto h_data = cudf::test::to_host<Element>(col);
+
+    out.resize(col.size());
+
+    if (col.nullable()) {
+      std::transform(thrust::make_counting_iterator(size_type{0}),
+                     thrust::make_counting_iterator(col.size()),
+                     out.begin(),
+                     [&h_data](auto idx) {
+                       return bit_is_set(h_data.second.data(), idx)
+                                ? numeric_to_string_precise(h_data.first[idx])
+                                : std::string("NULL");
+                     });
+
+    } else {
+      std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) {
+        return numeric_to_string_precise(el);
+      });
+    }
+  }
+
+  template <typename Element, std::enable_if_t<is_timestamp<Element>()>* = nullptr>
+  void operator()(cudf::column_view const& col,
+                  std::vector<std::string>& out,
+                  std::string const& indent)
+  {
+    //  For timestamps, convert timestamp column to column of strings, then
+    //  call string version
+    std::string format = [&]() {
+      if constexpr (std::is_same_v<cudf::timestamp_s, Element>) {
+        return std::string{"%Y-%m-%dT%H:%M:%SZ"};
+      } else if constexpr (std::is_same_v<cudf::timestamp_ms, Element>) {
+        return std::string{"%Y-%m-%dT%H:%M:%S.%3fZ"};
+      } else if constexpr (std::is_same_v<cudf::timestamp_us, Element>) {
+        return std::string{"%Y-%m-%dT%H:%M:%S.%6fZ"};
+      } else if constexpr (std::is_same_v<cudf::timestamp_ns, Element>) {
+        return std::string{"%Y-%m-%dT%H:%M:%S.%9fZ"};
+      }
+      return std::string{"%Y-%m-%d"};
+    }();
+
+    auto col_as_strings = cudf::strings::from_timestamps(col, format);
+    if (col_as_strings->size() == 0) { return; }
+
+    this->template operator()<cudf::string_view>(*col_as_strings, out, indent);
+  }
+
+  template <typename Element, std::enable_if_t<cudf::is_fixed_point<Element>()>* = nullptr>
+  void operator()(cudf::column_view const& col, std::vector<std::string>& out, std::string const&)
+  {
+    auto const h_data = cudf::test::to_host<Element>(col);
+    if (col.nullable()) {
+      std::transform(thrust::make_counting_iterator(size_type{0}),
+                     thrust::make_counting_iterator(col.size()),
+                     std::back_inserter(out),
+                     [&h_data](auto idx) {
+                       return h_data.second.empty() || bit_is_set(h_data.second.data(), idx)
+                                ? static_cast<std::string>(h_data.first[idx])
+                                : std::string("NULL");
+                     });
+    } else {
+      std::transform(std::cbegin(h_data.first),
+                     std::cend(h_data.first),
+                     std::back_inserter(out),
+                     [col](auto const& fp) { return static_cast<std::string>(fp); });
+    }
+  }
+
+  template <typename Element,
+            std::enable_if_t<std::is_same_v<Element, cudf::string_view>>* = nullptr>
+  void operator()(cudf::column_view const& col, std::vector<std::string>& out, std::string const&)
+  {
+    //
+    //  Implementation for strings, call special to_host variant
+    //
+    if (col.is_empty()) return;
+    auto h_data = cudf::test::to_host<std::string>(col);
+
+    // explicitly replace some special whitespace characters with their literal equivalents
+    auto cleaned = [](std::string_view in) {
+      std::string out(in);
+      auto replace_char = [](std::string& out, char c, std::string_view repl) {
+        for (std::string::size_type pos{}; out.npos != (pos = out.find(c, pos)); pos++) {
+          out.replace(pos, 1, repl);
+        }
+      };
+      replace_char(out, '\a', "\\a");
+      replace_char(out, '\b', "\\b");
+      replace_char(out, '\f', "\\f");
+      replace_char(out, '\r', "\\r");
+      replace_char(out, '\t', "\\t");
+      replace_char(out, '\n', "\\n");
+      replace_char(out, '\v', "\\v");
+      return out;
+    };
+
+    out.resize(col.size());
+    std::transform(thrust::make_counting_iterator(size_type{0}),
+                   thrust::make_counting_iterator(col.size()),
+                   out.begin(),
+                   [&](auto idx) {
+                     return h_data.second.empty() || bit_is_set(h_data.second.data(), idx)
+                              ? cleaned(h_data.first[idx])
+                              : std::string("NULL");
+                   });
+  }
+
+  template <typename Element,
+            std::enable_if_t<std::is_same_v<Element, cudf::dictionary32>>* = nullptr>
+  void operator()(cudf::column_view const& col, std::vector<std::string>& out, std::string const&)
+  {
+    cudf::dictionary_column_view dictionary(col);
+    if (col.is_empty()) return;
+    std::vector<std::string> keys    = to_strings(dictionary.keys());
+    std::vector<std::string> indices = to_strings({dictionary.indices().type(),
+                                                   dictionary.size(),
+                                                   dictionary.indices().head(),
+                                                   dictionary.null_mask(),
+                                                   dictionary.null_count(),
+                                                   dictionary.offset()});
+    out.insert(out.end(), keys.begin(), keys.end());
+    if (!indices.empty()) {
+      std::string first = "\x08 : " + indices.front();  // use : as delimiter
+      out.push_back(first);                             // between keys and indices
+      out.insert(out.end(), indices.begin() + 1, indices.end());
+    }
+  }
+
+  // Print the tick counts with the units
+  template <typename Element, std::enable_if_t<is_duration<Element>()>* = nullptr>
+  void operator()(cudf::column_view const& col, std::vector<std::string>& out, std::string const&)
+  {
+    auto h_data = cudf::test::to_host<Element>(col);
+
+    out.resize(col.size());
+
+    if (col.nullable()) {
+      std::transform(thrust::make_counting_iterator(size_type{0}),
+                     thrust::make_counting_iterator(col.size()),
+                     out.begin(),
+                     [&h_data](auto idx) {
+                       return bit_is_set(h_data.second.data(), idx)
+                                ? numeric_to_string_precise(h_data.first[idx].count()) +
+                                    duration_suffix(h_data.first[idx])
+                                : std::string("NULL");
+                     });
+
+    } else {
+      std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) {
+        return numeric_to_string_precise(el.count()) + duration_suffix(el);
+      });
+    }
+  }
+
+  template <typename Element, std::enable_if_t<std::is_same_v<Element, cudf::list_view>>* = nullptr>
+  void operator()(cudf::column_view const& col,
+                  std::vector<std::string>& out,
+                  std::string const& indent)
+  {
+    lists_column_view lcv(col);
+
+    // propagate slicing to the child if necessary
+    column_view child    = lcv.get_sliced_child(cudf::get_default_stream());
+    bool const is_sliced = lcv.offset() > 0 || child.offset() > 0;
+
+    std::string tmp =
+      get_nested_type_str(col) + (is_sliced ? "(sliced)" : "") + ":\n" + indent +
+      "Length : " + std::to_string(lcv.size()) + "\n" + indent +
+      "Offsets : " + (lcv.size() > 0 ? nested_offsets_to_string(lcv) : "") + "\n" +
+      (lcv.parent().nullable()
+         ? indent + "Null count: " + std::to_string(lcv.null_count()) + "\n" +
+             detail::to_string(cudf::test::bitmask_to_host(col), col.size(), indent) + "\n"
+         : "") +
+      // non-nested types don't typically display their null masks, so do it here for convenience.
+      (!is_nested(child.type()) && child.nullable()
+         ? "   " + detail::to_string(cudf::test::bitmask_to_host(child), child.size(), indent) +
+             "\n"
+         : "") +
+      (detail::to_string(child, ", ", indent + "   ")) + "\n";
+
+    out.push_back(tmp);
+  }
+
+  template <typename Element,
+            std::enable_if_t<std::is_same_v<Element, cudf::struct_view>>* = nullptr>
+  void operator()(cudf::column_view const& col,
+                  std::vector<std::string>& out,
+                  std::string const& indent)
+  {
+    structs_column_view view{col};
+
+    std::ostringstream out_stream;
+
+    out_stream << get_nested_type_str(col) << ":\n"
+               << indent << "Length : " << view.size() << ":\n";
+    if (view.nullable()) {
+      out_stream << indent << "Null count: " << view.null_count() << "\n"
+                 << detail::to_string(cudf::test::bitmask_to_host(col), col.size(), indent) << "\n";
+    }
+
+    auto iter = thrust::make_counting_iterator(0);
+    std::transform(
+      iter,
+      iter + view.num_children(),
+      std::ostream_iterator<std::string>(out_stream, "\n"),
+      [&](size_type index) {
+        auto child = view.get_sliced_child(index, cudf::get_default_stream());
+
+        // non-nested types don't typically display their null masks, so do it here for convenience.
+        return (!is_nested(child.type()) && child.nullable()
+                  ? "   " +
+                      detail::to_string(cudf::test::bitmask_to_host(child), child.size(), indent) +
+                      "\n"
+                  : "") +
+               detail::to_string(child, ", ", indent + "   ");
+      });
+
+    out.push_back(out_stream.str());
+  }
+};
+
+}  // namespace
+
+namespace detail {
+
+/**
+ * @copydoc cudf::test::detail::to_strings
+ */
+std::vector<std::string> to_strings(cudf::column_view const& col, std::string const& indent)
+{
+  std::vector<std::string> reply;
+  cudf::type_dispatcher(col.type(), column_view_printer{}, col, reply, indent);
+  return reply;
+}
+
+/**
+ * @copydoc cudf::test::detail::to_string(cudf::column_view, std::string, std::string)
+ *
+ * @param indent Indentation for all output
+ */
+std::string to_string(cudf::column_view const& col,
+                      std::string const& delimiter,
+                      std::string const& indent)
+{
+  std::ostringstream buffer;
+  std::vector<std::string> h_data = to_strings(col, indent);
+
+  buffer << indent;
+  std::copy(h_data.begin(),
+            h_data.end() - (!h_data.empty()),
+            std::ostream_iterator<std::string>(buffer, delimiter.c_str()));
+  if (!h_data.empty()) buffer << h_data.back();
+
+  return buffer.str();
+}
+
+/**
+ * @copydoc cudf::test::detail::to_string(std::vector<bitmask_type>, size_type, std::string)
+ *
+ * @param indent Indentation for all output.  See comment in `to_strings` for
+ * a detailed description.
+ */
+std::string to_string(std::vector<bitmask_type> const& null_mask,
+                      size_type null_mask_size,
+                      std::string const& indent)
+{
+  std::ostringstream buffer;
+  buffer << indent;
+  for (int idx = null_mask_size - 1; idx >= 0; idx--) {
+    buffer << (cudf::bit_is_set(null_mask.data(), idx) ? "1" : "0");
+  }
+  return buffer.str();
+}
+
+}  // namespace detail
+
+std::vector<std::string> to_strings(cudf::column_view const& col)
+{
+  return detail::to_strings(col);
+}
+
+std::string to_string(cudf::column_view const& col, std::string const& delimiter)
+{
+  return detail::to_string(col, delimiter);
+}
+
+std::string to_string(std::vector<bitmask_type> const& null_mask, size_type null_mask_size)
+{
+  return detail::to_string(null_mask, null_mask_size);
+}
+
+void print(cudf::column_view const& col, std::ostream& os)
+{
+  os << to_string(col, ",") << std::endl;
+}
+
+}  // namespace cudf::test
diff --git a/cpp/tests/utilities_tests/column_debug_tests.cpp b/cpp/tests/utilities_tests/column_debug_tests.cpp
new file mode 100644
index 00000000000..0dae407ad21
--- /dev/null
+++ b/cpp/tests/utilities_tests/column_debug_tests.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/debug_utilities.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <thrust/iterator/transform_iterator.h>
+
+#include <type_traits>
+
+template <typename T>
+struct ColumnDebugTestIntegral : public cudf::test::BaseFixture {};
+template <typename T>
+struct ColumnDebugTestFloatingPoint : public cudf::test::BaseFixture {};
+
+TYPED_TEST_SUITE(ColumnDebugTestIntegral, cudf::test::IntegralTypes);
+TYPED_TEST_SUITE(ColumnDebugTestFloatingPoint, cudf::test::FloatingPointTypes);
+
+TYPED_TEST(ColumnDebugTestIntegral, PrintColumnNumeric)
+{
+  char const* delimiter = ",";
+
+  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col({1, 2, 3, 4, 5});
+  auto std_col = cudf::test::make_type_param_vector<TypeParam>({1, 2, 3, 4, 5});
+
+  std::stringstream tmp;
+  auto string_iter =
+    thrust::make_transform_iterator(std::begin(std_col), [](auto e) { return std::to_string(e); });
+
+  std::copy(string_iter,
+            string_iter + std_col.size() - 1,
+            std::ostream_iterator<std::string>(tmp, delimiter));
+
+  tmp << std::to_string(std_col.back());
+
+  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str());
+}
+
+TYPED_TEST(ColumnDebugTestIntegral, PrintColumnWithInvalids)
+{
+  char const* delimiter = ",";
+
+  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col{{1, 2, 3, 4, 5}, {1, 0, 1, 0, 1}};
+  auto std_col = cudf::test::make_type_param_vector<TypeParam>({1, 2, 3, 4, 5});
+
+  std::ostringstream tmp;
+  tmp << std::to_string(std_col[0]) << delimiter << "NULL" << delimiter
+      << std::to_string(std_col[2]) << delimiter << "NULL" << delimiter
+      << std::to_string(std_col[4]);
+
+  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str());
+}
+
+TYPED_TEST(ColumnDebugTestFloatingPoint, PrintColumnNumeric)
+{
+  char const* delimiter = ",";
+
+  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col(
+    {10001523.25, 2.0, 3.75, 0.000000034, 5.3});
+
+  auto expected = std::is_same_v<TypeParam, double>
+                    ? "10001523.25,2,3.75,3.4e-08,5.2999999999999998"
+                    : "10001523,2,3.75,3.39999993e-08,5.30000019";
+
+  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected);
+}
+
+TYPED_TEST(ColumnDebugTestFloatingPoint, PrintColumnWithInvalids)
+{
+  char const* delimiter = ",";
+
+  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col(
+    {10001523.25, 2.0, 3.75, 0.000000034, 5.3}, {1, 0, 1, 0, 1});
+
+  auto expected = std::is_same_v<TypeParam, double>
+                    ? "10001523.25,NULL,3.75,NULL,5.2999999999999998"
+                    : "10001523,NULL,3.75,NULL,5.30000019";
+
+  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected);
+}
+
+struct ColumnDebugStringsTest : public cudf::test::BaseFixture {};
+
+TEST_F(ColumnDebugStringsTest, PrintColumnDuration)
+{
+  char const* delimiter = ",";
+
+  cudf::test::fixed_width_column_wrapper<cudf::duration_s, int32_t> cudf_col({100, 0, 7, 140000});
+
+  auto expected = "100 seconds,0 seconds,7 seconds,140000 seconds";
+
+  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected);
+}
+
+TEST_F(ColumnDebugStringsTest, StringsToString)
+{
+  char const* delimiter = ",";
+
+  std::vector<char const*> h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"};
+  cudf::test::strings_column_wrapper strings(
+    h_strings.begin(),
+    h_strings.end(),
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+
+  std::ostringstream tmp;
+  tmp << h_strings[0] << delimiter << h_strings[1] << delimiter << "NULL" << delimiter
+      << h_strings[3] << delimiter << h_strings[4] << delimiter << h_strings[5] << delimiter
+      << h_strings[6];
+
+  EXPECT_EQ(cudf::test::to_string(strings, delimiter), tmp.str());
+}
+
+TEST_F(ColumnDebugStringsTest, PrintEscapeStrings)
+{
+  char const* delimiter = ",";
+  cudf::test::strings_column_wrapper input({"e\te\ne", "é\bé\ré", "e\vé\fé\abell"});
+  std::string expected{"e\\te\\ne,é\\bé\\ré,e\\vé\\fé\\abell"};
+  EXPECT_EQ(cudf::test::to_string(input, delimiter), expected);
+}
diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp
index 90a7270cb29..07d2bea2b28 100644
--- a/cpp/tests/utilities_tests/column_utilities_tests.cpp
+++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp
@@ -182,106 +182,6 @@ TEST_F(ColumnUtilitiesStringsTest, StringsToHostAllNulls)
   EXPECT_TRUE(std::all_of(results.begin(), results.end(), [](auto s) { return s.empty(); }));
 }
 
-TEST_F(ColumnUtilitiesStringsTest, PrintColumnDuration)
-{
-  char const* delimiter = ",";
-
-  cudf::test::fixed_width_column_wrapper<cudf::duration_s, int32_t> cudf_col({100, 0, 7, 140000});
-
-  auto expected = "100 seconds,0 seconds,7 seconds,140000 seconds";
-
-  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected);
-}
-
-TYPED_TEST(ColumnUtilitiesTestIntegral, PrintColumnNumeric)
-{
-  char const* delimiter = ",";
-
-  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col({1, 2, 3, 4, 5});
-  auto std_col = cudf::test::make_type_param_vector<TypeParam>({1, 2, 3, 4, 5});
-
-  std::stringstream tmp;
-  auto string_iter =
-    thrust::make_transform_iterator(std::begin(std_col), [](auto e) { return std::to_string(e); });
-
-  std::copy(string_iter,
-            string_iter + std_col.size() - 1,
-            std::ostream_iterator<std::string>(tmp, delimiter));
-
-  tmp << std::to_string(std_col.back());
-
-  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str());
-}
-
-TYPED_TEST(ColumnUtilitiesTestIntegral, PrintColumnWithInvalids)
-{
-  char const* delimiter = ",";
-
-  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col{{1, 2, 3, 4, 5}, {1, 0, 1, 0, 1}};
-  auto std_col = cudf::test::make_type_param_vector<TypeParam>({1, 2, 3, 4, 5});
-
-  std::ostringstream tmp;
-  tmp << std::to_string(std_col[0]) << delimiter << "NULL" << delimiter
-      << std::to_string(std_col[2]) << delimiter << "NULL" << delimiter
-      << std::to_string(std_col[4]);
-
-  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str());
-}
-
-TYPED_TEST(ColumnUtilitiesTestFloatingPoint, PrintColumnNumeric)
-{
-  char const* delimiter = ",";
-
-  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col(
-    {10001523.25, 2.0, 3.75, 0.000000034, 5.3});
-
-  auto expected = std::is_same_v<TypeParam, double>
-                    ? "10001523.25,2,3.75,3.4e-08,5.2999999999999998"
-                    : "10001523,2,3.75,3.39999993e-08,5.30000019";
-
-  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected);
-}
-
-TYPED_TEST(ColumnUtilitiesTestFloatingPoint, PrintColumnWithInvalids)
-{
-  char const* delimiter = ",";
-
-  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col(
-    {10001523.25, 2.0, 3.75, 0.000000034, 5.3}, {1, 0, 1, 0, 1});
-
-  auto expected = std::is_same_v<TypeParam, double>
-                    ? "10001523.25,NULL,3.75,NULL,5.2999999999999998"
-                    : "10001523,NULL,3.75,NULL,5.30000019";
-
-  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected);
-}
-
-TEST_F(ColumnUtilitiesStringsTest, StringsToString)
-{
-  char const* delimiter = ",";
-
-  std::vector<char const*> h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"};
-  cudf::test::strings_column_wrapper strings(
-    h_strings.begin(),
-    h_strings.end(),
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::ostringstream tmp;
-  tmp << h_strings[0] << delimiter << h_strings[1] << delimiter << "NULL" << delimiter
-      << h_strings[3] << delimiter << h_strings[4] << delimiter << h_strings[5] << delimiter
-      << h_strings[6];
-
-  EXPECT_EQ(cudf::test::to_string(strings, delimiter), tmp.str());
-}
-
-TEST_F(ColumnUtilitiesStringsTest, PrintEscapeStrings)
-{
-  char const* delimiter = ",";
-  cudf::test::strings_column_wrapper input({"e\te\ne", "é\bé\ré", "e\vé\fé\abell"});
-  std::string expected{"e\\te\\ne,é\\bé\\ré,e\\vé\\fé\\abell"};
-  EXPECT_EQ(cudf::test::to_string(input, delimiter), expected);
-}
-
 TYPED_TEST(ColumnUtilitiesTestFixedPoint, NonNullableToHost)
 {
   using namespace numeric;

From e8cf0ebd517a9c81e294771a40c74ff5c4fb42da Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 23 Oct 2023 15:05:26 -0400
Subject: [PATCH 7/7] Expose stream parameter in public strings combine APIs
 (#14281)

Add stream parameter to public APIs:

- `cudf::strings::concatenate()` (x2)
- `cudf::strings::join_strings()`
- `cudf::strings::join_list_elements()` (x2)
- `cudf::strings::repeat_string()`
- `cudf::strings::repeat_strings()` (x2)

Also added stream gtests and fixed up some doxygen comments.

Reference #13744

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/14281
---
 cpp/include/cudf/strings/combine.hpp          | 88 ++++++++++--------
 cpp/include/cudf/strings/repeat_strings.hpp   |  8 +-
 cpp/src/strings/combine/concatenate.cu        | 14 +--
 cpp/src/strings/combine/join.cu               |  3 +-
 cpp/src/strings/combine/join_list_elements.cu | 13 +--
 cpp/src/strings/repeat_strings.cu             | 11 ++-
 cpp/tests/CMakeLists.txt                      |  1 +
 cpp/tests/streams/strings/combine_test.cpp    | 93 +++++++++++++++++++
 8 files changed, 168 insertions(+), 63 deletions(-)
 create mode 100644 cpp/tests/streams/strings/combine_test.cpp

diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp
index 71f65ac9080..568e8ac50ec 100644
--- a/cpp/include/cudf/strings/combine.hpp
+++ b/cpp/include/cudf/strings/combine.hpp
@@ -66,18 +66,20 @@ enum class output_if_empty_list {
  *
  * @throw cudf::logic_error if separator is not valid.
  *
- * @param strings Strings for this operation.
+ * @param input Strings for this operation
  * @param separator String that should inserted between each string.
  *        Default is an empty string.
- * @param narep String that should represent any null strings found.
+ * @param narep String to replace any null strings found.
  *        Default of invalid-scalar will ignore any null entries.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New column containing one string.
  */
 std::unique_ptr<column> join_strings(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   string_scalar const& separator      = string_scalar(""),
   string_scalar const& narep          = string_scalar("", false),
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -127,18 +129,17 @@ std::unique_ptr<column> join_strings(
  * @throw cudf::logic_error if the number of rows from @p separators and @p strings_columns
  *                          do not match
  *
- * @param strings_columns List of strings columns to concatenate.
+ * @param strings_columns List of strings columns to concatenate
  * @param separators Strings column that provides the separator for a given row
- * @param separator_narep String that should be used in place of a null separator for a given
- *        row. Default of invalid-scalar means no row separator value replacements.
- *        Default is an invalid string.
- * @param col_narep String that should be used in place of any null strings
- *        found in any column. Default of invalid-scalar means no null column value replacements.
- *        Default is an invalid string.
+ * @param separator_narep String to replace a null separator for a given row.
+ *        Default of invalid-scalar means no row separator value replacements.
+ * @param col_narep String that should be used in place of any null strings found in any column.
+ *        Default of invalid-scalar means no null column value replacements.
  * @param separate_nulls If YES, then the separator is included for null rows
  *        if `col_narep` is valid.
- * @param mr Resource for allocating device memory.
- * @return New column with concatenated results.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Resource for allocating device memory
+ * @return New column with concatenated results
  */
 std::unique_ptr<column> concatenate(
   table_view const& strings_columns,
@@ -146,6 +147,7 @@ std::unique_ptr<column> concatenate(
   string_scalar const& separator_narep = string_scalar("", false),
   string_scalar const& col_narep       = string_scalar("", false),
   separator_on_nulls separate_nulls    = separator_on_nulls::YES,
+  rmm::cuda_stream_view stream         = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
 
 /**
@@ -184,21 +186,23 @@ std::unique_ptr<column> concatenate(
  * @throw cudf::logic_error if separator is not valid.
  * @throw cudf::logic_error if only one column is specified
  *
- * @param strings_columns List of string columns to concatenate.
+ * @param strings_columns List of string columns to concatenate
  * @param separator String that should inserted between each string from each row.
  *        Default is an empty string.
- * @param narep String that should be used in place of any null strings
- *        found in any column. Default of invalid-scalar means any null entry in any column will
+ * @param narep String to replace any null strings found in any column.
+ *        Default of invalid-scalar means any null entry in any column will
  *        produces a null result for that row.
- * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New column with concatenated results.
+ * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New column with concatenated results
  */
 std::unique_ptr<column> concatenate(
   table_view const& strings_columns,
   string_scalar const& separator      = string_scalar(""),
   string_scalar const& narep          = string_scalar("", false),
   separator_on_nulls separate_nulls   = separator_on_nulls::YES,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -243,19 +247,20 @@ std::unique_ptr<column> concatenate(
  * @throw cudf::logic_error if the number of rows from `separators` and `lists_strings_column` do
  *        not match
  *
- * @param lists_strings_column Column containing lists of strings to concatenate.
- * @param separators Strings column that provides separators for concatenation.
- * @param separator_narep String that should be used to replace null separator, default is an
- *        invalid-scalar denoting that rows containing null separator will result in null string in
- *        the corresponding output rows.
- * @param string_narep String that should be used to replace null strings in any non-null list row,
- *        default is an invalid-scalar denoting that list rows containing null strings will result
- *        in null string in the corresponding output rows.
- * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid.
- * @param empty_list_policy if set to EMPTY_STRING, any input row that is an empty list will
+ * @param lists_strings_column Column containing lists of strings to concatenate
+ * @param separators Strings column that provides separators for concatenation
+ * @param separator_narep String that should be used to replace a null separator.
+ *        Default is an invalid-scalar denoting that rows containing null separator will result in
+ *        a null string in the corresponding output rows.
+ * @param string_narep String to replace null strings in any non-null list row.
+ *        Default is an invalid-scalar denoting that list rows containing null strings will result
+ *        in a null string in the corresponding output rows.
+ * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid
+ * @param empty_list_policy If set to EMPTY_STRING, any input row that is an empty list will
  *        result in an empty string. Otherwise, it will result in a null.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings column with concatenated results.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column with concatenated results
  */
 std::unique_ptr<column> join_list_elements(
   lists_column_view const& lists_strings_column,
@@ -264,6 +269,7 @@ std::unique_ptr<column> join_list_elements(
   string_scalar const& string_narep      = string_scalar("", false),
   separator_on_nulls separate_nulls      = separator_on_nulls::YES,
   output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING,
+  rmm::cuda_stream_view stream           = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource());
 
 /**
@@ -303,17 +309,18 @@ std::unique_ptr<column> join_list_elements(
  * @throw cudf::logic_error if input column is not lists of strings column.
  * @throw cudf::logic_error if separator is not valid.
  *
- * @param lists_strings_column Column containing lists of strings to concatenate.
- * @param separator String that should inserted between strings of each list row, default is an
- *        empty string.
- * @param narep String that should be used to replace null strings in any non-null list row, default
- *        is an invalid-scalar denoting that list rows containing null strings will result in null
- *        string in the corresponding output rows.
- * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid.
- * @param empty_list_policy if set to EMPTY_STRING, any input row that is an empty list will result
+ * @param lists_strings_column Column containing lists of strings to concatenate
+ * @param separator String to insert between strings of each list row.
+ *        Default is an empty string.
+ * @param narep String to replace null strings in any non-null list row.
+ *        Default is an invalid-scalar denoting that list rows containing null strings will result
+ *        in a null string in the corresponding output rows.
+ * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid
+ * @param empty_list_policy If set to EMPTY_STRING, any input row that is an empty list will result
  *        in an empty string. Otherwise, it will result in a null.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings column with concatenated results.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column with concatenated results
  */
 std::unique_ptr<column> join_list_elements(
   lists_column_view const& lists_strings_column,
@@ -321,6 +328,7 @@ std::unique_ptr<column> join_list_elements(
   string_scalar const& narep             = string_scalar("", false),
   separator_on_nulls separate_nulls      = separator_on_nulls::YES,
   output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING,
+  rmm::cuda_stream_view stream           = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp
index 2b6575f80d0..7dc9c33f579 100644
--- a/cpp/include/cudf/strings/repeat_strings.hpp
+++ b/cpp/include/cudf/strings/repeat_strings.hpp
@@ -52,12 +52,14 @@ namespace strings {
  *
  * @param input The scalar containing the string to repeat
  * @param repeat_times The number of times the input string is repeated
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned string scalar
  * @return New string scalar in which the input string is repeated
  */
 std::unique_ptr<string_scalar> repeat_string(
   string_scalar const& input,
   size_type repeat_times,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -81,12 +83,14 @@ std::unique_ptr<string_scalar> repeat_string(
  *
  * @param input The column containing strings to repeat
  * @param repeat_times The number of times each input string is repeated
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned strings column
  * @return New column containing the repeated strings
  */
 std::unique_ptr<column> repeat_strings(
   strings_column_view const& input,
   size_type repeat_times,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -115,13 +119,15 @@ std::unique_ptr<column> repeat_strings(
  *
  * @param input The column containing strings to repeat
  * @param repeat_times The column containing numbers of times that the corresponding input strings
- *                     are repeated
+ *                     for each row are repeated
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned strings column
  * @return New column containing the repeated strings.
  */
 std::unique_ptr<column> repeat_strings(
   strings_column_view const& input,
   column_view const& repeat_times,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu
index ba8acd23467..0a11b6dc460 100644
--- a/cpp/src/strings/combine/concatenate.cu
+++ b/cpp/src/strings/combine/concatenate.cu
@@ -267,11 +267,11 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
                                     string_scalar const& separator,
                                     string_scalar const& narep,
                                     separator_on_nulls separate_nulls,
+                                    rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::concatenate(
-    strings_columns, separator, narep, separate_nulls, cudf::get_default_stream(), mr);
+  return detail::concatenate(strings_columns, separator, narep, separate_nulls, stream, mr);
 }
 
 std::unique_ptr<column> concatenate(table_view const& strings_columns,
@@ -279,16 +279,12 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
                                     string_scalar const& separator_narep,
                                     string_scalar const& col_narep,
                                     separator_on_nulls separate_nulls,
+                                    rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::concatenate(strings_columns,
-                             separators,
-                             separator_narep,
-                             col_narep,
-                             separate_nulls,
-                             cudf::get_default_stream(),
-                             mr);
+  return detail::concatenate(
+    strings_columns, separators, separator_narep, col_narep, separate_nulls, stream, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index faf1be6a26f..9ab527feaf8 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -180,10 +180,11 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,
 std::unique_ptr<column> join_strings(strings_column_view const& strings,
                                      string_scalar const& separator,
                                      string_scalar const& narep,
+                                     rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::join_strings(strings, separator, narep, cudf::get_default_stream(), mr);
+  return detail::join_strings(strings, separator, narep, stream, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu
index eee59e37478..372b49fb0ee 100644
--- a/cpp/src/strings/combine/join_list_elements.cu
+++ b/cpp/src/strings/combine/join_list_elements.cu
@@ -301,16 +301,12 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                            string_scalar const& narep,
                                            separator_on_nulls separate_nulls,
                                            output_if_empty_list empty_list_policy,
+                                           rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::join_list_elements(lists_strings_column,
-                                    separator,
-                                    narep,
-                                    separate_nulls,
-                                    empty_list_policy,
-                                    cudf::get_default_stream(),
-                                    mr);
+  return detail::join_list_elements(
+    lists_strings_column, separator, narep, separate_nulls, empty_list_policy, stream, mr);
 }
 
 std::unique_ptr<column> join_list_elements(lists_column_view const& lists_strings_column,
@@ -319,6 +315,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                            string_scalar const& string_narep,
                                            separator_on_nulls separate_nulls,
                                            output_if_empty_list empty_list_policy,
+                                           rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
@@ -328,7 +325,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                     string_narep,
                                     separate_nulls,
                                     empty_list_policy,
-                                    cudf::get_default_stream(),
+                                    stream,
                                     mr);
 }
 
diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu
index 396e1e6a2ac..847a64f5602 100644
--- a/cpp/src/strings/repeat_strings.cu
+++ b/cpp/src/strings/repeat_strings.cu
@@ -67,7 +67,7 @@ std::unique_ptr<string_scalar> repeat_string(string_scalar const& input,
                       return in_ptr[idx % str_size];
                     });
 
-  return std::make_unique<string_scalar>(std::move(buff));
+  return std::make_unique<string_scalar>(std::move(buff), true, stream, mr);
 }
 
 namespace {
@@ -260,26 +260,29 @@ std::unique_ptr<column> repeat_strings(strings_column_view const& input,
 
 std::unique_ptr<string_scalar> repeat_string(string_scalar const& input,
                                              size_type repeat_times,
+                                             rmm::cuda_stream_view stream,
                                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::repeat_string(input, repeat_times, cudf::get_default_stream(), mr);
+  return detail::repeat_string(input, repeat_times, stream, mr);
 }
 
 std::unique_ptr<column> repeat_strings(strings_column_view const& input,
                                        size_type repeat_times,
+                                       rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::repeat_strings(input, repeat_times, cudf::get_default_stream(), mr);
+  return detail::repeat_strings(input, repeat_times, stream, mr);
 }
 
 std::unique_ptr<column> repeat_strings(strings_column_view const& input,
                                        column_view const& repeat_times,
+                                       rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::repeat_strings(input, repeat_times, cudf::get_default_stream(), mr);
+  return detail::repeat_strings(input, repeat_times, stream, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index eb0585d8f3e..e7f4914fe05 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -637,6 +637,7 @@ ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE tes
 ConfigureTest(
   STREAM_STRINGS_TEST
   streams/strings/case_test.cpp
+  streams/strings/combine_test.cpp
   streams/strings/convert_test.cpp
   streams/strings/find_test.cpp
   streams/strings/replace_test.cpp
diff --git a/cpp/tests/streams/strings/combine_test.cpp b/cpp/tests/streams/strings/combine_test.cpp
new file mode 100644
index 00000000000..9562634957a
--- /dev/null
+++ b/cpp/tests/streams/strings/combine_test.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/strings/combine.hpp>
+#include <cudf/strings/repeat_strings.hpp>
+
+#include <string>
+
+class StringsCombineTest : public cudf::test::BaseFixture {};
+
+TEST_F(StringsCombineTest, Concatenate)
+{
+  auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést"});
+  auto view  = cudf::table_view({input, input});
+
+  auto separators      = cudf::test::strings_column_wrapper({"_", ".", " "});
+  auto separators_view = cudf::strings_column_view(separators);
+  auto sep_on_null     = cudf::strings::separator_on_nulls::YES;
+
+  auto const separator = cudf::string_scalar(" ", true, cudf::test::get_default_stream());
+  auto const narep     = cudf::string_scalar("n/a", true, cudf::test::get_default_stream());
+  cudf::strings::concatenate(view, separator, narep, sep_on_null, cudf::test::get_default_stream());
+  cudf::strings::concatenate(
+    view, separators_view, narep, narep, sep_on_null, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsCombineTest, Join)
+{
+  auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést"});
+  auto view  = cudf::strings_column_view(input);
+
+  auto const separator = cudf::string_scalar(" ", true, cudf::test::get_default_stream());
+  auto const narep     = cudf::string_scalar("n/a", true, cudf::test::get_default_stream());
+  cudf::strings::join_strings(view, separator, narep, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsCombineTest, JoinLists)
+{
+  using STR_LISTS  = cudf::test::lists_column_wrapper<cudf::string_view>;
+  auto const input = STR_LISTS{
+    STR_LISTS{"a", "bb", "ccc"}, STR_LISTS{"ddd", "efgh", "ijk"}, STR_LISTS{"zzz", "xxxxx"}};
+  auto view = cudf::lists_column_view(input);
+
+  auto separators      = cudf::test::strings_column_wrapper({"_", ".", " "});
+  auto separators_view = cudf::strings_column_view(separators);
+  auto sep_on_null     = cudf::strings::separator_on_nulls::YES;
+  auto if_empty        = cudf::strings::output_if_empty_list::EMPTY_STRING;
+
+  auto const separator = cudf::string_scalar(" ", true, cudf::test::get_default_stream());
+  auto const narep     = cudf::string_scalar("n/a", true, cudf::test::get_default_stream());
+  cudf::strings::join_list_elements(
+    view, separator, narep, sep_on_null, if_empty, cudf::test::get_default_stream());
+  cudf::strings::join_list_elements(
+    view, separators_view, narep, narep, sep_on_null, if_empty, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsCombineTest, Repeat)
+{
+  auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést"});
+  auto view  = cudf::strings_column_view(input);
+  cudf::strings::repeat_strings(view, 0, cudf::test::get_default_stream());
+  cudf::strings::repeat_strings(view, 1, cudf::test::get_default_stream());
+  cudf::strings::repeat_strings(view, 10, cudf::test::get_default_stream());
+
+  auto counts = cudf::test::fixed_width_column_wrapper<cudf::size_type>({9, 8, 7});
+  cudf::strings::repeat_strings(view, counts, cudf::test::get_default_stream());
+  cudf::strings::repeat_strings(view, counts, cudf::test::get_default_stream());
+
+  auto const str = cudf::string_scalar("X", true, cudf::test::get_default_stream());
+  cudf::strings::repeat_string(str, 0, cudf::test::get_default_stream());
+  cudf::strings::repeat_string(str, 1, cudf::test::get_default_stream());
+  cudf::strings::repeat_string(str, 10, cudf::test::get_default_stream());
+
+  auto const invalid = cudf::string_scalar("", false, cudf::test::get_default_stream());
+  cudf::strings::repeat_string(invalid, 10, cudf::test::get_default_stream());
+}