diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d9f3824f706..0742d039092 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -392,6 +392,7 @@ add_library( src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu src/io/parquet/page_hdr.cu + src/io/parquet/page_string_decode.cu src/io/parquet/reader.cpp src/io/parquet/reader_impl.cpp src/io/parquet/reader_impl_helpers.cpp diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index c2901dc61ee..28710f2a745 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -785,6 +785,25 @@ std::vector cycle_dtypes(std::vector const& dtype_ return out_dtypes; } +/** + * @brief Repeat the given two data types with a given ratio of a:b. + * + * The first dtype will have 'first_num' columns and the second will have 'num_cols - first_num' + * columns. + */ +std::vector mix_dtypes(std::pair const& dtype_ids, + cudf::size_type num_cols, + int first_num) +{ + std::vector out_dtypes; + out_dtypes.reserve(num_cols); + for (cudf::size_type col = 0; col < first_num; ++col) + out_dtypes.push_back(dtype_ids.first); + for (cudf::size_type col = first_num; col < num_cols; ++col) + out_dtypes.push_back(dtype_ids.second); + return out_dtypes; +} + std::unique_ptr create_random_table(std::vector const& dtype_ids, table_size_bytes table_bytes, data_profile const& profile, diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index 8a5811218d0..a2efdb819bf 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -666,6 +666,21 @@ std::unique_ptr create_sequence_table( */ std::vector cycle_dtypes(std::vector const& dtype_ids, cudf::size_type num_cols); + +/** + * @brief Repeat the given two data types with a given ratio of a:b. + * + * The first dtype will have 'first_num' columns and the second will have 'num_cols - first_num' + * columns. 
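A minimal usage sketch of the new `mix_dtypes` helper (illustrative only, not part of the diff; it mirrors how the small-mixed parquet reader benchmark added later in this patch calls it):

```cpp
// Sketch: the first `first_num` columns take dtype_ids.first, the rest take dtype_ids.second.
auto const dtypes =
  mix_dtypes({cudf::type_id::STRING, cudf::type_id::INT32}, /*num_cols=*/4, /*first_num=*/1);
// dtypes == {STRING, INT32, INT32, INT32}: one string column followed by three int32 columns.
```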
+ * + * @param dtype_ids Pair of requested column types + * @param num_cols Total number of columns in the output vector + * @param first_num Total number of columns of type `dtype_ids.first` + * @return A vector of type_ids + */ +std::vector mix_dtypes(std::pair const& dtype_ids, + cudf::size_type num_cols, + int first_num); /** * @brief Create a random null mask object * diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index a8d40492890..9d1026f80fc 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -114,6 +114,38 @@ void BM_parquet_read_io_compression( parquet_read_common(write_opts, source_sink, state); } +template +void BM_parquet_read_io_small_mixed(nvbench::state& state, + nvbench::type_list>) +{ + auto const d_type = + std::pair{cudf::type_id::STRING, cudf::type_id::INT32}; + + cudf::size_type const cardinality = state.get_int64("cardinality"); + cudf::size_type const run_length = state.get_int64("run_length"); + cudf::size_type const num_strings = state.get_int64("num_string_cols"); + auto const source_type = IOType; + + // want 80 pages total, across 4 columns, so 20 pages per column + cudf::size_type constexpr n_col = 4; + cudf::size_type constexpr page_size_rows = 10'000; + cudf::size_type constexpr num_rows = page_size_rows * (80 / n_col); + + auto const tbl = + create_random_table(mix_dtypes(d_type, n_col, num_strings), + row_count{num_rows}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cuio_source_sink_pair source_sink(source_type); + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .max_page_size_rows(10'000) + .compression(cudf::io::compression_type::NONE); + + parquet_read_common(write_opts, source_sink, state); +} + template void BM_parquet_read_chunks( nvbench::state& state, @@ -203,3 +235,12 @@ NVBENCH_BENCH_TYPES(BM_parquet_read_chunks, .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}) .add_int64_axis("byte_limit", {0, 500'000}); + +NVBENCH_BENCH_TYPES(BM_parquet_read_io_small_mixed, + NVBENCH_TYPE_AXES(nvbench::enum_type_list)) + .set_name("parquet_read_io_small_mixed") + .set_type_axes_names({"io"}) + .set_min_samples(4) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}) + .add_int64_axis("num_string_cols", {1, 2, 3}); diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index b9a012ab3a9..28ff4e27321 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -522,13 +522,13 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks }; }); - out_buffers[col_idx]._null_mask = std::move(merged_null_mask); + out_buffers[col_idx].set_null_mask(std::move(merged_null_mask)); } else { // Since child column doesn't have a mask, copy parent null mask auto mask_size = bitmask_allocation_size_bytes(parent_mask_len); - out_buffers[col_idx]._null_mask = - rmm::device_buffer(static_cast(parent_valid_map_base), mask_size, stream, mr); + out_buffers[col_idx].set_null_mask( + rmm::device_buffer(static_cast(parent_valid_map_base), mask_size, stream, mr)); } } } diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 5fa7d924f82..e49378485fc 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -14,31 +14,11 @@ * 
limitations under the License. */ -#include "parquet_gpu.hpp" -#include "rle_stream.cuh" -#include +#include "page_decode.cuh" + #include -#include -#include #include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include namespace cudf { namespace io { @@ -47,548 +27,6 @@ namespace gpu { namespace { -constexpr int preprocess_block_size = num_rle_stream_decode_threads; // 512 -constexpr int decode_block_size = 128; -constexpr int non_zero_buffer_size = decode_block_size * 2; -constexpr int rolling_index(int index) { return index & (non_zero_buffer_size - 1); } -template -constexpr int rolling_lvl_index(int index) -{ - return index % lvl_buf_size; -} - -struct page_state_s { - uint8_t const* data_start; - uint8_t const* data_end; - uint8_t const* lvl_end; - uint8_t const* dict_base; // ptr to dictionary page data - int32_t dict_size; // size of dictionary data - int32_t first_row; // First row in page to output - int32_t num_rows; // Rows in page to decode (including rows to be skipped) - int32_t first_output_value; // First value in page to output - int32_t num_input_values; // total # of input/level values in the page - int32_t dtype_len; // Output data type length - int32_t dtype_len_in; // Can be larger than dtype_len if truncating 32-bit into 8-bit - int32_t dict_bits; // # of bits to store dictionary indices - uint32_t dict_run; - int32_t dict_val; - uint32_t initial_rle_run[NUM_LEVEL_TYPES]; // [def,rep] - int32_t initial_rle_value[NUM_LEVEL_TYPES]; // [def,rep] - int32_t error; - PageInfo page; - ColumnChunkDesc col; - - // (leaf) value decoding - int32_t nz_count; // number of valid entries in nz_idx (write position in circular buffer) - int32_t dict_pos; // write position of dictionary indices - int32_t src_pos; // input read position of final output value - int32_t ts_scale; // timestamp scale: <0: divide by -ts_scale, >0: multiply by ts_scale - - // repetition/definition level decoding - int32_t input_value_count; // how many values of the input we've processed - int32_t input_row_count; // how many rows of the input we've processed - int32_t input_leaf_count; // how many leaf values of the input we've processed - uint8_t const* lvl_start[NUM_LEVEL_TYPES]; // [def,rep] - uint8_t const* abs_lvl_start[NUM_LEVEL_TYPES]; // [def,rep] - uint8_t const* abs_lvl_end[NUM_LEVEL_TYPES]; // [def,rep] - int32_t lvl_count[NUM_LEVEL_TYPES]; // how many of each of the streams we've decoded - int32_t row_index_lower_bound; // lower bound of row indices we should process - - // a shared-memory cache of frequently used data when decoding. The source of this data is - // normally stored in global memory which can yield poor performance. So, when possible - // we copy that info here prior to decoding - PageNestingDecodeInfo nesting_decode_cache[max_cacheable_nesting_decode_info]; - // points to either nesting_decode_cache above when possible, or to the global source otherwise - PageNestingDecodeInfo* nesting_info; -}; - -// buffers only used in the decode kernel. separated from page_state_s to keep -// shared memory usage in other kernels (eg, gpuComputePageSizes) down. 
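For context on the `initial_rle_run` / `dict_run` fields above and the level/dictionary decoders that follow: the values they hold are Parquet hybrid RLE / bit-packed run headers, where the low bit selects the run type. A short assumption-labelled sketch of that convention (not code from this diff):

```cpp
// Sketch of the run-header convention these decoders rely on:
//   (header & 1) == 0 -> repeated-value run; (header >> 1) values all equal one literal value
//   (header & 1) == 1 -> bit-packed run; (header >> 1) groups of 8 bit-packed values
inline bool is_literal_run(uint32_t header) { return (header & 1) != 0; }
inline uint32_t run_value_count(uint32_t header)
{
  return is_literal_run(header) ? (header >> 1) * 8 : (header >> 1);
}
```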
-struct page_state_buffers_s { - uint32_t nz_idx[non_zero_buffer_size]; // circular buffer of non-null value positions - uint32_t dict_idx[non_zero_buffer_size]; // Dictionary index, boolean, or string offset values - uint32_t str_len[non_zero_buffer_size]; // String length for plain encoding of strings -}; - -/** - * @brief Returns whether or not a page spans either the beginning or the end of the - * specified row bounds - * - * @param s The page to be checked - * @param start_row The starting row index - * @param num_rows The number of rows - * - * @return True if the page spans the beginning or the end of the row bounds - */ -inline __device__ bool is_bounds_page(page_state_s* const s, size_t start_row, size_t num_rows) -{ - size_t const page_begin = s->col.start_row + s->page.chunk_row; - size_t const page_end = page_begin + s->page.num_rows; - size_t const begin = start_row; - size_t const end = start_row + num_rows; - - return ((page_begin <= begin && page_end >= begin) || (page_begin <= end && page_end >= end)); -} - -/** - * @brief Returns whether or not a page is completely contained within the specified - * row bounds - * - * @param s The page to be checked - * @param start_row The starting row index - * @param num_rows The number of rows - * - * @return True if the page is completely contained within the row bounds - */ -inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row, size_t num_rows) -{ - size_t const page_begin = s->col.start_row + s->page.chunk_row; - size_t const page_end = page_begin + s->page.num_rows; - size_t const begin = start_row; - size_t const end = start_row + num_rows; - - return page_begin >= begin && page_end <= end; -} - -/** - * @brief Parse the beginning of the level section (definition or repetition), - * initializes the initial RLE run & value, and returns the section length - * - * @param[in,out] s The page state - * @param[in] cur The current data position - * @param[in] end The end of the data - * @param[in] level_bits The bits required - * @param[in] is_decode_step True if we are performing the decode step. - * @param[in,out] decoders The repetition and definition level stream decoders - * - * @return The length of the section - */ -__device__ uint32_t InitLevelSection(page_state_s* s, - uint8_t const* cur, - uint8_t const* end, - level_type lvl) -{ - int32_t len; - int level_bits = s->col.level_bits[lvl]; - Encoding encoding = lvl == level_type::DEFINITION ? s->page.definition_level_encoding - : s->page.repetition_level_encoding; - - auto start = cur; - if (level_bits == 0) { - len = 0; - s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value - s->initial_rle_value[lvl] = 0; - s->lvl_start[lvl] = cur; - s->abs_lvl_start[lvl] = cur; - } else if (encoding == Encoding::RLE) { - // V2 only uses RLE encoding, so only perform check here - if (s->page.def_lvl_bytes || s->page.rep_lvl_bytes) { - len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes; - } else if (cur + 4 < end) { - len = 4 + (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24); - cur += 4; - } else { - len = 0; - s->error = 2; - } - s->abs_lvl_start[lvl] = cur; - if (!s->error) { - uint32_t run = get_vlq32(cur, end); - s->initial_rle_run[lvl] = run; - if (!(run & 1)) { - int v = (cur < end) ? cur[0] : 0; - cur++; - if (level_bits > 8) { - v |= ((cur < end) ? 
cur[0] : 0) << 8; - cur++; - } - s->initial_rle_value[lvl] = v; - } - s->lvl_start[lvl] = cur; - } - - if (cur > end) { s->error = 2; } - } else if (encoding == Encoding::BIT_PACKED) { - len = (s->page.num_input_values * level_bits + 7) >> 3; - s->initial_rle_run[lvl] = ((s->page.num_input_values + 7) >> 3) * 2 + 1; // literal run - s->initial_rle_value[lvl] = 0; - s->lvl_start[lvl] = cur; - s->abs_lvl_start[lvl] = cur; - } else { - s->error = 3; - len = 0; - } - - s->abs_lvl_end[lvl] = start + len; - - return static_cast(len); -} - -/** - * @brief Decode values out of a definition or repetition stream - * - * @param[in,out] s Page state input/output - * @param[in] t target_count Target count of stream values on output - * @param[in] t Warp0 thread ID (0..31) - * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION - */ -template -__device__ void gpuDecodeStream( - level_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) -{ - uint8_t const* cur_def = s->lvl_start[lvl]; - uint8_t const* end = s->lvl_end; - uint32_t level_run = s->initial_rle_run[lvl]; - int32_t level_val = s->initial_rle_value[lvl]; - int level_bits = s->col.level_bits[lvl]; - int32_t num_input_values = s->num_input_values; - int32_t value_count = s->lvl_count[lvl]; - int32_t batch_coded_count = 0; - - while (value_count < target_count && value_count < num_input_values) { - int batch_len; - if (level_run <= 1) { - // Get a new run symbol from the byte stream - int sym_len = 0; - if (!t) { - uint8_t const* cur = cur_def; - if (cur < end) { level_run = get_vlq32(cur, end); } - if (!(level_run & 1)) { - if (cur < end) level_val = cur[0]; - cur++; - if (level_bits > 8) { - if (cur < end) level_val |= cur[0] << 8; - cur++; - } - } - if (cur > end || level_run <= 1) { s->error = 0x10; } - sym_len = (int32_t)(cur - cur_def); - __threadfence_block(); - } - sym_len = shuffle(sym_len); - level_val = shuffle(level_val); - level_run = shuffle(level_run); - cur_def += sym_len; - } - if (s->error) { break; } - - batch_len = min(num_input_values - value_count, 32); - if (level_run & 1) { - // Literal run - int batch_len8; - batch_len = min(batch_len, (level_run >> 1) * 8); - batch_len8 = (batch_len + 7) >> 3; - if (t < batch_len) { - int bitpos = t * level_bits; - uint8_t const* cur = cur_def + (bitpos >> 3); - bitpos &= 7; - if (cur < end) level_val = cur[0]; - cur++; - if (level_bits > 8 - bitpos && cur < end) { - level_val |= cur[0] << 8; - cur++; - if (level_bits > 16 - bitpos && cur < end) level_val |= cur[0] << 16; - } - level_val = (level_val >> bitpos) & ((1 << level_bits) - 1); - } - level_run -= batch_len8 * 2; - cur_def += batch_len8 * level_bits; - } else { - // Repeated value - batch_len = min(batch_len, level_run >> 1); - level_run -= batch_len * 2; - } - if (t < batch_len) { - int idx = value_count + t; - output[rolling_index(idx)] = level_val; - } - batch_coded_count += batch_len; - value_count += batch_len; - } - - // update the stream info - if (!t) { - s->lvl_start[lvl] = cur_def; - s->initial_rle_run[lvl] = level_run; - s->initial_rle_value[lvl] = level_val; - s->lvl_count[lvl] = value_count; - } -} - -/** - * @brief Performs RLE decoding of dictionary indexes - * - * @param[in,out] s Page state input/output - * @param[out] sb Page state buffer output - * @param[in] target_pos Target index position in dict_idx buffer (may exceed this value by up to - * 31) - * @param[in] t Warp1 thread ID (0..31) - * - * @return A pair containing the new output position, and the total length of 
strings decoded (this - * will only be valid on thread 0 and if sizes_only is true). In the event that this function - * decodes strings beyond target_pos, the total length of strings returned will include these - * additional values. - */ -template -__device__ cuda::std::pair gpuDecodeDictionaryIndices( - volatile page_state_s* s, - [[maybe_unused]] volatile page_state_buffers_s* sb, - int target_pos, - int t) -{ - uint8_t const* end = s->data_end; - int dict_bits = s->dict_bits; - int pos = s->dict_pos; - int str_len = 0; - - while (pos < target_pos) { - int is_literal, batch_len; - if (!t) { - uint32_t run = s->dict_run; - uint8_t const* cur = s->data_start; - if (run <= 1) { - run = (cur < end) ? get_vlq32(cur, end) : 0; - if (!(run & 1)) { - // Repeated value - int bytecnt = (dict_bits + 7) >> 3; - if (cur + bytecnt <= end) { - int32_t run_val = cur[0]; - if (bytecnt > 1) { - run_val |= cur[1] << 8; - if (bytecnt > 2) { - run_val |= cur[2] << 16; - if (bytecnt > 3) { run_val |= cur[3] << 24; } - } - } - s->dict_val = run_val & ((1 << dict_bits) - 1); - } - cur += bytecnt; - } - } - if (run & 1) { - // Literal batch: must output a multiple of 8, except for the last batch - int batch_len_div8; - batch_len = max(min(32, (int)(run >> 1) * 8), 1); - batch_len_div8 = (batch_len + 7) >> 3; - run -= batch_len_div8 * 2; - cur += batch_len_div8 * dict_bits; - } else { - batch_len = max(min(32, (int)(run >> 1)), 1); - run -= batch_len * 2; - } - s->dict_run = run; - s->data_start = cur; - is_literal = run & 1; - __threadfence_block(); - } - __syncwarp(); - is_literal = shuffle(is_literal); - batch_len = shuffle(batch_len); - - // compute dictionary index. - int dict_idx = 0; - if (t < batch_len) { - dict_idx = s->dict_val; - if (is_literal) { - int32_t ofs = (t - ((batch_len + 7) & ~7)) * dict_bits; - uint8_t const* p = s->data_start + (ofs >> 3); - ofs &= 7; - if (p < end) { - uint32_t c = 8 - ofs; - dict_idx = (*p++) >> ofs; - if (c < dict_bits && p < end) { - dict_idx |= (*p++) << c; - c += 8; - if (c < dict_bits && p < end) { - dict_idx |= (*p++) << c; - c += 8; - if (c < dict_bits && p < end) { dict_idx |= (*p++) << c; } - } - } - dict_idx &= (1 << dict_bits) - 1; - } - } - - // if we're not computing sizes, store off the dictionary index - if constexpr (!sizes_only) { sb->dict_idx[rolling_index(pos + t)] = dict_idx; } - } - - // if we're computing sizes, add the length(s) - if constexpr (sizes_only) { - int const len = [&]() { - if (t >= batch_len || (pos + t >= target_pos)) { return 0; } - uint32_t const dict_pos = (s->dict_bits > 0) ? dict_idx * sizeof(string_index_pair) : 0; - if (dict_pos < (uint32_t)s->dict_size) { - const auto* src = reinterpret_cast(s->dict_base + dict_pos); - return src->second; - } - return 0; - }(); - - using WarpReduce = cub::WarpReduce; - __shared__ typename WarpReduce::TempStorage temp_storage; - // note: str_len will only be valid on thread 0. 
- str_len += WarpReduce(temp_storage).Sum(len); - } - - pos += batch_len; - } - return {pos, str_len}; -} - -/** - * @brief Performs RLE decoding of dictionary indexes, for when dict_size=1 - * - * @param[in,out] s Page state input/output - * @param[out] sb Page state buffer output - * @param[in] target_pos Target write position - * @param[in] t Thread ID - * - * @return The new output position - */ -__device__ int gpuDecodeRleBooleans(volatile page_state_s* s, - volatile page_state_buffers_s* sb, - int target_pos, - int t) -{ - uint8_t const* end = s->data_end; - int pos = s->dict_pos; - - while (pos < target_pos) { - int is_literal, batch_len; - if (!t) { - uint32_t run = s->dict_run; - uint8_t const* cur = s->data_start; - if (run <= 1) { - run = (cur < end) ? get_vlq32(cur, end) : 0; - if (!(run & 1)) { - // Repeated value - s->dict_val = (cur < end) ? cur[0] & 1 : 0; - cur++; - } - } - if (run & 1) { - // Literal batch: must output a multiple of 8, except for the last batch - int batch_len_div8; - batch_len = max(min(32, (int)(run >> 1) * 8), 1); - if (batch_len >= 8) { batch_len &= ~7; } - batch_len_div8 = (batch_len + 7) >> 3; - run -= batch_len_div8 * 2; - cur += batch_len_div8; - } else { - batch_len = max(min(32, (int)(run >> 1)), 1); - run -= batch_len * 2; - } - s->dict_run = run; - s->data_start = cur; - is_literal = run & 1; - __threadfence_block(); - } - __syncwarp(); - is_literal = shuffle(is_literal); - batch_len = shuffle(batch_len); - if (t < batch_len) { - int dict_idx; - if (is_literal) { - int32_t ofs = t - ((batch_len + 7) & ~7); - uint8_t const* p = s->data_start + (ofs >> 3); - dict_idx = (p < end) ? (p[0] >> (ofs & 7u)) & 1 : 0; - } else { - dict_idx = s->dict_val; - } - sb->dict_idx[rolling_index(pos + t)] = dict_idx; - } - pos += batch_len; - } - return pos; -} - -/** - * @brief Parses the length and position of strings and returns total length of all strings - * processed - * - * @param[in,out] s Page state input/output - * @param[out] sb Page state buffer output - * @param[in] target_pos Target output position - * @param[in] t Thread ID - * - * @return Total length of strings processed - */ -template -__device__ size_type gpuInitStringDescriptors(volatile page_state_s* s, - [[maybe_unused]] volatile page_state_buffers_s* sb, - int target_pos, - int t) -{ - int pos = s->dict_pos; - int total_len = 0; - - // This step is purely serial - if (!t) { - uint8_t const* cur = s->data_start; - int dict_size = s->dict_size; - int k = s->dict_val; - - while (pos < target_pos) { - int len; - if (k + 4 <= dict_size) { - len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); - k += 4; - if (k + len > dict_size) { len = 0; } - } else { - len = 0; - } - if constexpr (!sizes_only) { - sb->dict_idx[rolling_index(pos)] = k; - sb->str_len[rolling_index(pos)] = len; - } - k += len; - total_len += len; - pos++; - } - s->dict_val = k; - __threadfence_block(); - } - - return total_len; -} - -/** - * @brief Retrieves string information for a string at the specified source position - * - * @param[in] s Page state input - * @param[out] sb Page state buffer output - * @param[in] src_pos Source position - * - * @return A pair containing a pointer to the string and its length - */ -inline __device__ cuda::std::pair gpuGetStringData( - volatile page_state_s* s, volatile page_state_buffers_s* sb, int src_pos) -{ - char const* ptr = nullptr; - size_t len = 0; - - if (s->dict_base) { - // String dictionary - uint32_t dict_pos = - (s->dict_bits > 0) ? 
sb->dict_idx[rolling_index(src_pos)] * sizeof(string_index_pair) : 0; - if (dict_pos < (uint32_t)s->dict_size) { - auto const* src = reinterpret_cast(s->dict_base + dict_pos); - ptr = src->first; - len = src->second; - } - } else { - // Plain encoding - uint32_t dict_pos = sb->dict_idx[rolling_index(src_pos)]; - if (dict_pos <= (uint32_t)s->dict_size) { - ptr = reinterpret_cast(s->data_start + dict_pos); - len = sb->str_len[rolling_index(src_pos)]; - } - } - - return {ptr, len}; -} - /** * @brief Output a string descriptor * @@ -964,669 +402,6 @@ static __device__ void gpuOutputGeneric( } /** - * @brief Sets up block-local page state information from the global pages. - * - * @param[in, out] s The local page state to be filled in - * @param[in] p The global page to be copied from - * @param[in] chunks The global list of chunks - * @param[in] min_row Crop all rows below min_row - * @param[in] num_rows Maximum number of rows to read - * @param[in] is_decode_step If we are setting up for the decode step (instead of the preprocess) - * @param[in] decoders rle_stream decoders which will be used for decoding levels. Optional. - * Currently only used by gpuComputePageSizes step) - */ -static __device__ bool setupLocalPageInfo(page_state_s* const s, - PageInfo const* p, - device_span chunks, - size_t min_row, - size_t num_rows, - bool is_decode_step) -{ - int t = threadIdx.x; - int chunk_idx; - - // Fetch page info - if (!t) { - s->page = *p; - s->nesting_info = nullptr; - } - __syncthreads(); - - if (s->page.flags & PAGEINFO_FLAGS_DICTIONARY) { return false; } - // Fetch column chunk info - chunk_idx = s->page.chunk_idx; - if (!t) { s->col = chunks[chunk_idx]; } - - // if we can use the nesting decode cache, set it up now - auto const can_use_decode_cache = s->page.nesting_info_size <= max_cacheable_nesting_decode_info; - if (can_use_decode_cache) { - int depth = 0; - while (depth < s->page.nesting_info_size) { - int const thread_depth = depth + t; - if (thread_depth < s->page.nesting_info_size) { - // these values need to be copied over from global - s->nesting_decode_cache[thread_depth].max_def_level = - s->page.nesting_decode[thread_depth].max_def_level; - s->nesting_decode_cache[thread_depth].page_start_value = - s->page.nesting_decode[thread_depth].page_start_value; - s->nesting_decode_cache[thread_depth].start_depth = - s->page.nesting_decode[thread_depth].start_depth; - s->nesting_decode_cache[thread_depth].end_depth = - s->page.nesting_decode[thread_depth].end_depth; - } - depth += blockDim.x; - } - } - if (!t) { - s->nesting_info = can_use_decode_cache ? s->nesting_decode_cache : s->page.nesting_decode; - } - - __syncthreads(); - - // zero counts - int depth = 0; - while (depth < s->page.num_output_nesting_levels) { - int const thread_depth = depth + t; - if (thread_depth < s->page.num_output_nesting_levels) { - s->nesting_info[thread_depth].valid_count = 0; - s->nesting_info[thread_depth].value_count = 0; - s->nesting_info[thread_depth].null_count = 0; - } - depth += blockDim.x; - } - __syncthreads(); - - if (!t) { - s->error = 0; - - // our starting row (absolute index) is - // col.start_row == absolute row index - // page.chunk-row == relative row index within the chunk - size_t page_start_row = s->col.start_row + s->page.chunk_row; - - // IMPORTANT : nested schemas can have 0 rows in a page but still have - // values. The case is: - // - On page N-1, the last row starts, with 2/6 values encoded - // - On page N, the remaining 4/6 values are encoded, but there are no new rows. 
- // if (s->page.num_input_values > 0 && s->page.num_rows > 0) { - if (s->page.num_input_values > 0) { - uint8_t* cur = s->page.page_data; - uint8_t* end = cur + s->page.uncompressed_page_size; - - uint32_t dtype_len_out = s->col.data_type >> 3; - s->ts_scale = 0; - // Validate data type - auto const data_type = s->col.data_type & 7; - switch (data_type) { - case BOOLEAN: - s->dtype_len = 1; // Boolean are stored as 1 byte on the output - break; - case INT32: [[fallthrough]]; - case FLOAT: s->dtype_len = 4; break; - case INT64: - if (s->col.ts_clock_rate) { - int32_t units = 0; - // Duration types are not included because no scaling is done when reading - if (s->col.converted_type == TIMESTAMP_MILLIS) { - units = cudf::timestamp_ms::period::den; - } else if (s->col.converted_type == TIMESTAMP_MICROS) { - units = cudf::timestamp_us::period::den; - } else if (s->col.logical_type.TIMESTAMP.unit.isset.NANOS) { - units = cudf::timestamp_ns::period::den; - } - if (units and units != s->col.ts_clock_rate) { - s->ts_scale = (s->col.ts_clock_rate < units) ? -(units / s->col.ts_clock_rate) - : (s->col.ts_clock_rate / units); - } - } - [[fallthrough]]; - case DOUBLE: s->dtype_len = 8; break; - case INT96: s->dtype_len = 12; break; - case BYTE_ARRAY: - if (s->col.converted_type == DECIMAL) { - auto const decimal_precision = s->col.decimal_precision; - s->dtype_len = [decimal_precision]() { - if (decimal_precision <= MAX_DECIMAL32_PRECISION) { - return sizeof(int32_t); - } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { - return sizeof(int64_t); - } else { - return sizeof(__int128_t); - } - }(); - } else { - s->dtype_len = sizeof(string_index_pair); - } - break; - default: // FIXED_LEN_BYTE_ARRAY: - s->dtype_len = dtype_len_out; - s->error |= (s->dtype_len <= 0); - break; - } - // Special check for downconversions - s->dtype_len_in = s->dtype_len; - if (s->col.converted_type == DECIMAL && data_type == FIXED_LEN_BYTE_ARRAY) { - s->dtype_len = [dtype_len = s->dtype_len]() { - if (dtype_len <= sizeof(int32_t)) { - return sizeof(int32_t); - } else if (dtype_len <= sizeof(int64_t)) { - return sizeof(int64_t); - } else { - return sizeof(__int128_t); - } - }(); - } else if (data_type == INT32) { - if (dtype_len_out == 1) { - // INT8 output - s->dtype_len = 1; - } else if (dtype_len_out == 2) { - // INT16 output - s->dtype_len = 2; - } else if (s->col.converted_type == TIME_MILLIS) { - // INT64 output - s->dtype_len = 8; - } - } else if (data_type == BYTE_ARRAY && dtype_len_out == 4) { - s->dtype_len = 4; // HASH32 output - } else if (data_type == INT96) { - s->dtype_len = 8; // Convert to 64-bit timestamp - } - - // NOTE: s->page.num_rows, s->col.chunk_row, s->first_row and s->num_rows will be - // invalid/bogus during first pass of the preprocess step for nested types. this is ok - // because we ignore these values in that stage. - { - auto const max_row = min_row + num_rows; - - // if we are totally outside the range of the input, do nothing - if ((page_start_row > max_row) || (page_start_row + s->page.num_rows < min_row)) { - s->first_row = 0; - s->num_rows = 0; - } - // otherwise - else { - s->first_row = page_start_row >= min_row ? 0 : min_row - page_start_row; - auto const max_page_rows = s->page.num_rows - s->first_row; - s->num_rows = (page_start_row + s->first_row) + max_page_rows <= max_row - ? 
max_page_rows - : max_row - (page_start_row + s->first_row); - } - } - - // during the decoding step we need to offset the global output buffers - // for each level of nesting so that we write to the section this page - // is responsible for. - // - for flat schemas, we can do this directly by using row counts - // - for nested schemas, these offsets are computed during the preprocess step - // - // NOTE: in a chunked read situation, s->col.column_data_base and s->col.valid_map_base - // will be aliased to memory that has been freed when we get here in the non-decode step, so - // we cannot check against nullptr. we'll just check a flag directly. - if (is_decode_step) { - int max_depth = s->col.max_nesting_depth; - for (int idx = 0; idx < max_depth; idx++) { - PageNestingDecodeInfo* nesting_info = &s->nesting_info[idx]; - - size_t output_offset; - // schemas without lists - if (s->col.max_level[level_type::REPETITION] == 0) { - output_offset = page_start_row >= min_row ? page_start_row - min_row : 0; - } - // for schemas with lists, we've already got the exact value precomputed - else { - output_offset = nesting_info->page_start_value; - } - - nesting_info->data_out = static_cast(s->col.column_data_base[idx]); - - if (nesting_info->data_out != nullptr) { - // anything below max depth with a valid data pointer must be a list, so the - // element size is the size of the offset type. - uint32_t len = idx < max_depth - 1 ? sizeof(cudf::size_type) : s->dtype_len; - nesting_info->data_out += (output_offset * len); - } - nesting_info->valid_map = s->col.valid_map_base[idx]; - if (nesting_info->valid_map != nullptr) { - nesting_info->valid_map += output_offset >> 5; - nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f); - } - } - } - s->first_output_value = 0; - - // Find the compressed size of repetition levels - cur += InitLevelSection(s, cur, end, level_type::REPETITION); - // Find the compressed size of definition levels - cur += InitLevelSection(s, cur, end, level_type::DEFINITION); - - s->dict_bits = 0; - s->dict_base = nullptr; - s->dict_size = 0; - // NOTE: if additional encodings are supported in the future, modifications must - // be made to is_supported_encoding() in reader_impl_preprocess.cu - switch (s->page.encoding) { - case Encoding::PLAIN_DICTIONARY: - case Encoding::RLE_DICTIONARY: - // RLE-packed dictionary indices, first byte indicates index length in bits - if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) { - // String dictionary: use index - s->dict_base = reinterpret_cast(s->col.str_dict_index); - s->dict_size = s->col.page_info[0].num_input_values * sizeof(string_index_pair); - } else { - s->dict_base = - s->col.page_info[0].page_data; // dictionary is always stored in the first page - s->dict_size = s->col.page_info[0].uncompressed_page_size; - } - s->dict_run = 0; - s->dict_val = 0; - s->dict_bits = (cur < end) ? 
*cur++ : 0; - if (s->dict_bits > 32 || !s->dict_base) { s->error = (10 << 8) | s->dict_bits; } - break; - case Encoding::PLAIN: - s->dict_size = static_cast(end - cur); - s->dict_val = 0; - if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } - break; - case Encoding::RLE: s->dict_run = 0; break; - default: - s->error = 1; // Unsupported encoding - break; - } - if (cur > end) { s->error = 1; } - s->lvl_end = cur; - s->data_start = cur; - s->data_end = end; - } else { - s->error = 1; - } - - s->lvl_count[level_type::REPETITION] = 0; - s->lvl_count[level_type::DEFINITION] = 0; - s->nz_count = 0; - s->num_input_values = s->page.num_input_values; - s->dict_pos = 0; - s->src_pos = 0; - - // for flat hierarchies, we can't know how many leaf values to skip unless we do a full - // preprocess of the definition levels (since nulls will have no actual decodable value, there - // is no direct correlation between # of rows and # of decodable values). so we will start - // processing at the beginning of the value stream and disregard any indices that start - // before the first row. - if (s->col.max_level[level_type::REPETITION] == 0) { - s->page.skipped_values = 0; - s->page.skipped_leaf_values = 0; - s->input_value_count = 0; - s->input_row_count = 0; - s->input_leaf_count = 0; - - s->row_index_lower_bound = -1; - } - // for nested hierarchies, we have run a preprocess that lets us skip directly to the values - // we need to start decoding at - else { - // input_row_count translates to "how many rows we have processed so far", so since we are - // skipping directly to where we want to start decoding, set it to first_row - s->input_row_count = s->first_row; - - // return the lower bound to compare (page-relative) thread row index against. Explanation: - // In the case of nested schemas, rows can span page boundaries. That is to say, - // we can encounter the first value for row X on page M, but the last value for page M - // might not be the last value for row X. page M+1 (or further) may contain the last value. - // - // This means that the first values we encounter for a given page (M+1) may not belong to the - // row indicated by chunk_row, but to the row before it that spanned page boundaries. If that - // previous row is within the overall row bounds, include the values by allowing relative row - // index -1 - int const max_row = (min_row + num_rows) - 1; - if (min_row < page_start_row && max_row >= page_start_row - 1) { - s->row_index_lower_bound = -1; - } else { - s->row_index_lower_bound = s->first_row; - } - - // if we're in the decoding step, jump directly to the first - // value we care about - if (is_decode_step) { - s->input_value_count = s->page.skipped_values > -1 ? s->page.skipped_values : 0; - } else { - s->input_value_count = 0; - s->input_leaf_count = 0; - s->page.skipped_values = - -1; // magic number to indicate it hasn't been set for use inside UpdatePageSizes - s->page.skipped_leaf_values = 0; - } - } - - __threadfence_block(); - } - __syncthreads(); - - return true; -} - -/** - * @brief Store a validity mask containing value_count bits into the output validity buffer of the - * page. - * - * @param[in,out] nesting_info The page/nesting information to store the mask in. 
The validity map - * offset is also updated - * @param[in] valid_mask The validity mask to be stored - * @param[in] value_count # of bits in the validity mask - */ -static __device__ void store_validity(PageNestingDecodeInfo* nesting_info, - uint32_t valid_mask, - int32_t value_count) -{ - int word_offset = nesting_info->valid_map_offset / 32; - int bit_offset = nesting_info->valid_map_offset % 32; - // if we fit entirely in the output word - if (bit_offset + value_count <= 32) { - auto relevant_mask = static_cast((static_cast(1) << value_count) - 1); - - if (relevant_mask == ~0) { - nesting_info->valid_map[word_offset] = valid_mask; - } else { - atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset)); - atomicOr(nesting_info->valid_map + word_offset, (valid_mask & relevant_mask) << bit_offset); - } - } - // we're going to spill over into the next word. - // note : writing both values here is the lazy/slow way. we could be writing just - // the first word and rolling the remaining bits over into the next call. - // however, some basic performance tests shows almost no difference between these two - // methods. More detailed performance testing might be worthwhile here. - else { - uint32_t bits_left = 32 - bit_offset; - - // first word. strip bits_left bits off the beginning and store that - uint32_t relevant_mask = ((1 << bits_left) - 1); - uint32_t mask_word0 = valid_mask & relevant_mask; - atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset)); - atomicOr(nesting_info->valid_map + word_offset, mask_word0 << bit_offset); - - // second word. strip the remainder of the bits off the end and store that - relevant_mask = ((1 << (value_count - bits_left)) - 1); - uint32_t mask_word1 = valid_mask & (relevant_mask << bits_left); - atomicAnd(nesting_info->valid_map + word_offset + 1, ~(relevant_mask)); - atomicOr(nesting_info->valid_map + word_offset + 1, mask_word1 >> bits_left); - } - - nesting_info->valid_map_offset += value_count; -} - -/** - * @brief Compute the nesting bounds within the hierarchy to add values to, and the definition level - * D to which we should considered them null or not. - * - * @param[out] start_depth The start nesting depth - * @param[out] end_depth The end nesting depth (inclusive) - * @param[out] d The definition level up to which added values are not-null. 
if t is out of bounds, - * d will be -1 - * @param[in] s Local page information - * @param[in] rep Repetition level buffer - * @param[in] def Definition level buffer - * @param[in] input_value_count The current count of input level values we have processed - * @param[in] target_input_value_count The desired # of input level values we want to process - * @param[in] t Thread index - */ -template -inline __device__ void get_nesting_bounds(int& start_depth, - int& end_depth, - int& d, - page_state_s* s, - level_t const* const rep, - level_t const* const def, - int input_value_count, - int32_t target_input_value_count, - int t) -{ - start_depth = -1; - end_depth = -1; - d = -1; - if (input_value_count + t < target_input_value_count) { - int const index = rolling_lvl_index(input_value_count + t); - d = static_cast(def[index]); - // if we have repetition (there are list columns involved) we have to - // bound what nesting levels we apply values to - if (s->col.max_level[level_type::REPETITION] > 0) { - level_t const r = rep[index]; - start_depth = s->nesting_info[r].start_depth; - end_depth = s->nesting_info[d].end_depth; - } - // for columns without repetition (even ones involving structs) we always - // traverse the entire hierarchy. - else { - start_depth = 0; - end_depth = s->col.max_nesting_depth - 1; - } - } -} - -/** - * @brief Process a batch of incoming repetition/definition level values and generate - * validity, nested column offsets (where appropriate) and decoding indices. - * - * @param[in] target_input_value_count The # of repetition/definition levels to process up to - * @param[in] s Local page information - * @param[out] sb Page state buffer output - * @param[in] rep Repetition level buffer - * @param[in] def Definition level buffer - * @param[in] t Thread index - */ -template -static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, - page_state_s* s, - page_state_buffers_s* sb, - level_t const* const rep, - level_t const* const def, - int t) -{ - // max nesting depth of the column - int const max_depth = s->col.max_nesting_depth; - bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; - // how many (input) values we've processed in the page so far - int input_value_count = s->input_value_count; - // how many rows we've processed in the page so far - int input_row_count = s->input_row_count; - - PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - - // process until we've reached the target - while (input_value_count < target_input_value_count) { - // determine the nesting bounds for this thread (the range of nesting depths we - // will generate new value indices and validity bits for) - int start_depth, end_depth, d; - get_nesting_bounds( - start_depth, end_depth, d, s, rep, def, input_value_count, target_input_value_count, t); - - // 4 interesting things to track: - // thread_value_count : # of output values from the view of this thread - // warp_value_count : # of output values for the whole warp - // - // thread_valid_count : # of valid values from the view of this thread - // warp_valid_count : # of valid values for the whole warp - uint32_t thread_value_count, warp_value_count; - uint32_t thread_valid_count, warp_valid_count; - - // track (page-relative) row index for the thread so we can compare against input bounds - // keep track of overall # of rows we've read. - int const is_new_row = start_depth == 0 ? 
1 : 0; - uint32_t const warp_row_count_mask = ballot(is_new_row); - int32_t const thread_row_index = - input_row_count + ((__popc(warp_row_count_mask & ((1 << t) - 1)) + is_new_row) - 1); - input_row_count += __popc(warp_row_count_mask); - // is this thread within read row bounds? - int const in_row_bounds = thread_row_index >= s->row_index_lower_bound && - thread_row_index < (s->first_row + s->num_rows) - ? 1 - : 0; - - // compute warp and thread value counts - uint32_t const warp_count_mask = - ballot((0 >= start_depth && 0 <= end_depth) && in_row_bounds ? 1 : 0); - - warp_value_count = __popc(warp_count_mask); - // Note : ((1 << t) - 1) implies "for all threads before me" - thread_value_count = __popc(warp_count_mask & ((1 << t) - 1)); - - // walk from 0 to max_depth - uint32_t next_thread_value_count, next_warp_value_count; - for (int s_idx = 0; s_idx < max_depth; s_idx++) { - PageNestingDecodeInfo* nesting_info = &nesting_info_base[s_idx]; - - // if we are within the range of nesting levels we should be adding value indices for - int const in_nesting_bounds = - ((s_idx >= start_depth && s_idx <= end_depth) && in_row_bounds) ? 1 : 0; - - // everything up to the max_def_level is a non-null value - uint32_t const is_valid = d >= nesting_info->max_def_level && in_nesting_bounds ? 1 : 0; - - // compute warp and thread valid counts - uint32_t const warp_valid_mask = - // for flat schemas, a simple ballot_sync gives us the correct count and bit positions - // because every value in the input matches to a value in the output - !has_repetition - ? ballot(is_valid) - : - // for nested schemas, it's more complicated. This warp will visit 32 incoming values, - // however not all of them will necessarily represent a value at this nesting level. so - // the validity bit for thread t might actually represent output value t-6. the correct - // position for thread t's bit is cur_value_count. for cuda 11 we could use - // __reduce_or_sync(), but until then we have to do a warp reduce. - WarpReduceOr32(is_valid << thread_value_count); - - thread_valid_count = __popc(warp_valid_mask & ((1 << thread_value_count) - 1)); - warp_valid_count = __popc(warp_valid_mask); - - // if this is the value column emit an index for value decoding - if (is_valid && s_idx == max_depth - 1) { - int const src_pos = nesting_info->valid_count + thread_valid_count; - int const dst_pos = nesting_info->value_count + thread_value_count; - // nz_idx is a mapping of src buffer indices to destination buffer indices - sb->nz_idx[rolling_index(src_pos)] = dst_pos; - } - - // compute warp and thread value counts for the -next- nesting level. we need to - // do this for nested schemas so that we can emit an offset for the -current- nesting - // level. more concretely : the offset for the current nesting level == current length of the - // next nesting level - if (s_idx < max_depth - 1) { - uint32_t const next_warp_count_mask = - ballot((s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0); - next_warp_value_count = __popc(next_warp_count_mask); - next_thread_value_count = __popc(next_warp_count_mask & ((1 << t) - 1)); - - // if we're -not- at a leaf column and we're within nesting/row bounds - // and we have a valid data_out pointer, it implies this is a list column, so - // emit an offset. 
- if (in_nesting_bounds && nesting_info->data_out != nullptr) { - int const idx = nesting_info->value_count + thread_value_count; - cudf::size_type const ofs = nesting_info_base[s_idx + 1].value_count + - next_thread_value_count + - nesting_info_base[s_idx + 1].page_start_value; - (reinterpret_cast(nesting_info->data_out))[idx] = ofs; - } - } - - // nested schemas always read and write to the same bounds (that is, read and write positions - // are already pre-bounded by first_row/num_rows). flat schemas will start reading at the - // first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector here - // we need to adjust our computed mask to take into account the write row bounds. - int const in_write_row_bounds = - !has_repetition - ? thread_row_index >= s->first_row && thread_row_index < (s->first_row + s->num_rows) - : in_row_bounds; - int const first_thread_in_write_range = - !has_repetition ? __ffs(ballot(in_write_row_bounds)) - 1 : 0; - - // # of bits to of the validity mask to write out - int const warp_valid_mask_bit_count = - first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range; - - // increment count of valid values, count of total values, and update validity mask - if (!t) { - if (nesting_info->valid_map != nullptr && warp_valid_mask_bit_count > 0) { - uint32_t const warp_output_valid_mask = warp_valid_mask >> first_thread_in_write_range; - store_validity(nesting_info, warp_output_valid_mask, warp_valid_mask_bit_count); - - nesting_info->null_count += warp_valid_mask_bit_count - __popc(warp_output_valid_mask); - } - nesting_info->valid_count += warp_valid_count; - nesting_info->value_count += warp_value_count; - } - - // propagate value counts for the next level - warp_value_count = next_warp_value_count; - thread_value_count = next_thread_value_count; - } - - input_value_count += min(32, (target_input_value_count - input_value_count)); - __syncwarp(); - } - - // update - if (!t) { - // update valid value count for decoding and total # of values we've processed - s->nz_count = nesting_info_base[max_depth - 1].valid_count; - s->input_value_count = input_value_count; - s->input_row_count = input_row_count; - } -} - -/** - * @brief Process repetition and definition levels up to the target count of leaf values. - * - * In order to decode actual leaf values from the input stream, we need to generate the - * list of non-null value positions (page_state_s::nz_idx). We do this by processing - * the repetition and definition level streams. This process also generates validity information, - * and offset column values in the case of nested schemas. Because of the way the streams - * are encoded, this function may generate slightly more than target_leaf_count. - * - * Only runs on 1 warp. 
- * - * @param[in] s The local page state - * @param[out] sb Page state buffer output - * @param[in] target_leaf_count Target count of non-null leaf values to generate indices for - * @param[in] rep Repetition level buffer - * @param[in] def Definition level buffer - * @param[in] t Thread index - */ -template -__device__ void gpuDecodeLevels(page_state_s* s, - page_state_buffers_s* sb, - int32_t target_leaf_count, - level_t* const rep, - level_t* const def, - int t) -{ - bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; - - constexpr int batch_size = 32; - int cur_leaf_count = target_leaf_count; - while (!s->error && s->nz_count < target_leaf_count && - s->input_value_count < s->num_input_values) { - if (has_repetition) { gpuDecodeStream(rep, s, cur_leaf_count, t, level_type::REPETITION); } - gpuDecodeStream(def, s, cur_leaf_count, t, level_type::DEFINITION); - __syncwarp(); - - // because the rep and def streams are encoded separately, we cannot request an exact - // # of values to be decoded at once. we can only process the lowest # of decoded rep/def - // levels we get. - int actual_leaf_count = has_repetition ? min(s->lvl_count[level_type::REPETITION], - s->lvl_count[level_type::DEFINITION]) - : s->lvl_count[level_type::DEFINITION]; - - // process what we got back - gpuUpdateValidityOffsetsAndRowIndices( - actual_leaf_count, s, sb, rep, def, t); - cur_leaf_count = actual_leaf_count + batch_size; - __syncwarp(); - } -} - -/** - * @brief Returns the total size in bytes of string char data in the page. * * This function expects the dictionary position to be at 0 and will traverse * the entire thing. @@ -1817,7 +592,7 @@ __global__ void __launch_bounds__(preprocess_block_size) rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; // setup page info - if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false)) { return; } + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, all_types_filter{}, false)) { return; } // initialize the stream decoders (requires values computed in setupLocalPageInfo) int const max_batch_size = lvl_buf_size; @@ -1880,7 +655,7 @@ __global__ void __launch_bounds__(preprocess_block_size) // in the trim pass, for anything with lists, we only need to fully process bounding pages (those // at the beginning or the end of the row bounds) - if (!is_base_pass && !is_bounds_page(s, min_row, num_rows)) { + if (!is_base_pass && !is_bounds_page(s, min_row, num_rows, has_repetition)) { int depth = 0; while (depth < s->page.num_output_nesting_levels) { auto const thread_depth = depth + t; @@ -1961,28 +736,8 @@ __global__ void __launch_bounds__(preprocess_block_size) } } -// Copies null counts back to `nesting_decode` at the end of scope -struct null_count_back_copier { - page_state_s* s; - int t; - __device__ ~null_count_back_copier() - { - if (s->nesting_info != nullptr and s->nesting_info == s->nesting_decode_cache) { - int depth = 0; - while (depth < s->page.num_output_nesting_levels) { - int const thread_depth = depth + t; - if (thread_depth < s->page.num_output_nesting_levels) { - s->page.nesting_decode[thread_depth].null_count = - s->nesting_decode_cache[thread_depth].null_count; - } - depth += blockDim.x; - } - } - } -}; - /** - * @brief Kernel for co the column data stored in the pages + * @brief Kernel for computing the column data stored in the pages * * This function will write the page data and the page data's validity to the * output specified in the page's column chunk. 
If necessary, additional @@ -2008,25 +763,13 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( int out_thread0; [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } - - bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; - - // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. - // - // corner case: in the case of lists, we can have pages that contain "0" rows if the current row - // starts before this page and ends after this page: - // P0 P1 P2 - // |---------|---------|----------| - // ^------------------^ - // row start row end - // P1 will contain 0 rows - // - if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || - is_page_contained(s, min_row, num_rows)))) { + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, non_string_filter{chunks}, true)) { return; } + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + if (s->dict_base) { out_thread0 = (s->dict_bits > 0) ? 64 : 32; } else { @@ -2074,7 +817,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } } else { // WARP1..WARP3: Decode values - int dtype = s->col.data_type & 7; + int const dtype = s->col.data_type & 7; src_pos += t - out_thread0; // the position in the output column/buffer diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh new file mode 100644 index 00000000000..4469ec59b7a --- /dev/null +++ b/cpp/src/io/parquet/page_decode.cuh @@ -0,0 +1,1327 @@ +/* + * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "parquet_gpu.hpp" +#include "rle_stream.cuh" + +#include + +#include + +namespace cudf::io::parquet::gpu { + +constexpr int preprocess_block_size = num_rle_stream_decode_threads; // 512 +constexpr int decode_block_size = 128; +constexpr int non_zero_buffer_size = decode_block_size * 2; + +constexpr int rolling_index(int index) { return index & (non_zero_buffer_size - 1); } +template +constexpr int rolling_lvl_index(int index) +{ + return index % lvl_buf_size; +} + +struct page_state_s { + uint8_t const* data_start; + uint8_t const* data_end; + uint8_t const* lvl_end; + uint8_t const* dict_base; // ptr to dictionary page data + int32_t dict_size; // size of dictionary data + int32_t first_row; // First row in page to output + int32_t num_rows; // Rows in page to decode (including rows to be skipped) + int32_t first_output_value; // First value in page to output + int32_t num_input_values; // total # of input/level values in the page + int32_t dtype_len; // Output data type length + int32_t dtype_len_in; // Can be larger than dtype_len if truncating 32-bit into 8-bit + int32_t dict_bits; // # of bits to store dictionary indices + uint32_t dict_run; + int32_t dict_val; + uint32_t initial_rle_run[NUM_LEVEL_TYPES]; // [def,rep] + int32_t initial_rle_value[NUM_LEVEL_TYPES]; // [def,rep] + int32_t error; + PageInfo page; + ColumnChunkDesc col; + + // (leaf) value decoding + int32_t nz_count; // number of valid entries in nz_idx (write position in circular buffer) + int32_t dict_pos; // write position of dictionary indices + int32_t src_pos; // input read position of final output value + int32_t ts_scale; // timestamp scale: <0: divide by -ts_scale, >0: multiply by ts_scale + + // repetition/definition level decoding + int32_t input_value_count; // how many values of the input we've processed + int32_t input_row_count; // how many rows of the input we've processed + int32_t input_leaf_count; // how many leaf values of the input we've processed + uint8_t const* lvl_start[NUM_LEVEL_TYPES]; // [def,rep] + uint8_t const* abs_lvl_start[NUM_LEVEL_TYPES]; // [def,rep] + uint8_t const* abs_lvl_end[NUM_LEVEL_TYPES]; // [def,rep] + int32_t lvl_count[NUM_LEVEL_TYPES]; // how many of each of the streams we've decoded + int32_t row_index_lower_bound; // lower bound of row indices we should process + + // a shared-memory cache of frequently used data when decoding. The source of this data is + // normally stored in global memory which can yield poor performance. So, when possible + // we copy that info here prior to decoding + PageNestingDecodeInfo nesting_decode_cache[max_cacheable_nesting_decode_info]; + // points to either nesting_decode_cache above when possible, or to the global source otherwise + PageNestingDecodeInfo* nesting_info; +}; + +// buffers only used in the decode kernel. separated from page_state_s to keep +// shared memory usage in other kernels (eg, gpuComputePageSizes) down. 
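Before the circular buffers below, a brief illustration of how `rolling_index` wraps positions into them (a sketch built on the constants defined above, not part of the diff): because `non_zero_buffer_size` is a power of two (128 * 2 = 256), the bitwise AND with `non_zero_buffer_size - 1` is equivalent to `index % non_zero_buffer_size`.

```cpp
// Sketch: masking with (size - 1) works only because size is a power of two.
constexpr int non_zero_buffer_size = 128 * 2;  // decode_block_size * 2, as above
constexpr int rolling_index(int index) { return index & (non_zero_buffer_size - 1); }

static_assert(rolling_index(255) == 255);
static_assert(rolling_index(256) == 0);                             // wraps around
static_assert(rolling_index(1000) == 1000 % non_zero_buffer_size);  // == 232
```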
+struct page_state_buffers_s { + uint32_t nz_idx[non_zero_buffer_size]; // circular buffer of non-null value positions + uint32_t dict_idx[non_zero_buffer_size]; // Dictionary index, boolean, or string offset values + uint32_t str_len[non_zero_buffer_size]; // String length for plain encoding of strings +}; + +// Copies null counts back to `nesting_decode` at the end of scope +struct null_count_back_copier { + page_state_s* s; + int t; + __device__ ~null_count_back_copier() + { + if (s->nesting_info != nullptr and s->nesting_info == s->nesting_decode_cache) { + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + int const thread_depth = depth + t; + if (thread_depth < s->page.num_output_nesting_levels) { + s->page.nesting_decode[thread_depth].null_count = + s->nesting_decode_cache[thread_depth].null_count; + } + depth += blockDim.x; + } + } + } +}; + +/** + * @brief Test if the given page is in a string column + */ +constexpr bool is_string_col(PageInfo const& page, device_span chunks) +{ + if (page.flags & PAGEINFO_FLAGS_DICTIONARY != 0) { return false; } + auto const& col = chunks[page.chunk_idx]; + return is_string_col(col); +} + +/** + * @brief Returns whether or not a page spans either the beginning or the end of the + * specified row bounds + * + * @param s The page to be checked + * @param start_row The starting row index + * @param num_rows The number of rows + * @param has_repetition True if the schema has nesting + * + * @return True if the page spans the beginning or the end of the row bounds + */ +inline __device__ bool is_bounds_page(page_state_s* const s, + size_t start_row, + size_t num_rows, + bool has_repetition) +{ + size_t const page_begin = s->col.start_row + s->page.chunk_row; + size_t const page_end = page_begin + s->page.num_rows; + size_t const begin = start_row; + size_t const end = start_row + num_rows; + + // for non-nested schemas, rows cannot span pages, so use a more restrictive test + return has_repetition + ? ((page_begin <= begin && page_end >= begin) || (page_begin <= end && page_end >= end)) + : ((page_begin < begin && page_end > begin) || (page_begin < end && page_end > end)); +} + +/** + * @brief Returns whether or not a page is completely contained within the specified + * row bounds + * + * @param s The page to be checked + * @param start_row The starting row index + * @param num_rows The number of rows + * + * @return True if the page is completely contained within the row bounds + */ +inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row, size_t num_rows) +{ + size_t const page_begin = s->col.start_row + s->page.chunk_row; + size_t const page_end = page_begin + s->page.num_rows; + size_t const begin = start_row; + size_t const end = start_row + num_rows; + + return page_begin >= begin && page_end <= end; +} + +/** + * @brief Retrieves string information for a string at the specified source position + * + * @param[in] s Page state input + * @param[out] sb Page state buffer output + * @param[in] src_pos Source position + * + * @return A pair containing a pointer to the string and its length + */ +inline __device__ cuda::std::pair gpuGetStringData( + page_state_s volatile* s, page_state_buffers_s volatile* sb, int src_pos) +{ + char const* ptr = nullptr; + size_t len = 0; + + if (s->dict_base) { + // String dictionary + uint32_t dict_pos = + (s->dict_bits > 0) ? 
sb->dict_idx[rolling_index(src_pos)] * sizeof(string_index_pair) : 0; + if (dict_pos < (uint32_t)s->dict_size) { + auto const* src = reinterpret_cast(s->dict_base + dict_pos); + ptr = src->first; + len = src->second; + } + } else { + // Plain encoding + uint32_t dict_pos = sb->dict_idx[rolling_index(src_pos)]; + if (dict_pos <= (uint32_t)s->dict_size) { + ptr = reinterpret_cast(s->data_start + dict_pos); + len = sb->str_len[rolling_index(src_pos)]; + } + } + + return {ptr, len}; +} + +/** + * @brief Performs RLE decoding of dictionary indexes + * + * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output + * @param[in] target_pos Target index position in dict_idx buffer (may exceed this value by up to + * 31) + * @param[in] t Warp1 thread ID (0..31) + * + * @return A pair containing the new output position, and the total length of strings decoded (this + * will only be valid on thread 0 and if sizes_only is true). In the event that this function + * decodes strings beyond target_pos, the total length of strings returned will include these + * additional values. + */ +template +__device__ cuda::std::pair gpuDecodeDictionaryIndices( + page_state_s volatile* s, + [[maybe_unused]] page_state_buffers_s volatile* sb, + int target_pos, + int t) +{ + uint8_t const* end = s->data_end; + int dict_bits = s->dict_bits; + int pos = s->dict_pos; + int str_len = 0; + + while (pos < target_pos) { + int is_literal, batch_len; + if (!t) { + uint32_t run = s->dict_run; + uint8_t const* cur = s->data_start; + if (run <= 1) { + run = (cur < end) ? get_vlq32(cur, end) : 0; + if (!(run & 1)) { + // Repeated value + int bytecnt = (dict_bits + 7) >> 3; + if (cur + bytecnt <= end) { + int32_t run_val = cur[0]; + if (bytecnt > 1) { + run_val |= cur[1] << 8; + if (bytecnt > 2) { + run_val |= cur[2] << 16; + if (bytecnt > 3) { run_val |= cur[3] << 24; } + } + } + s->dict_val = run_val & ((1 << dict_bits) - 1); + } + cur += bytecnt; + } + } + if (run & 1) { + // Literal batch: must output a multiple of 8, except for the last batch + int batch_len_div8; + batch_len = max(min(32, (int)(run >> 1) * 8), 1); + batch_len_div8 = (batch_len + 7) >> 3; + run -= batch_len_div8 * 2; + cur += batch_len_div8 * dict_bits; + } else { + batch_len = max(min(32, (int)(run >> 1)), 1); + run -= batch_len * 2; + } + s->dict_run = run; + s->data_start = cur; + is_literal = run & 1; + __threadfence_block(); + } + __syncwarp(); + is_literal = shuffle(is_literal); + batch_len = shuffle(batch_len); + + // compute dictionary index. + int dict_idx = 0; + if (t < batch_len) { + dict_idx = s->dict_val; + if (is_literal) { + int32_t ofs = (t - ((batch_len + 7) & ~7)) * dict_bits; + uint8_t const* p = s->data_start + (ofs >> 3); + ofs &= 7; + if (p < end) { + uint32_t c = 8 - ofs; + dict_idx = (*p++) >> ofs; + if (c < dict_bits && p < end) { + dict_idx |= (*p++) << c; + c += 8; + if (c < dict_bits && p < end) { + dict_idx |= (*p++) << c; + c += 8; + if (c < dict_bits && p < end) { dict_idx |= (*p++) << c; } + } + } + dict_idx &= (1 << dict_bits) - 1; + } + } + + // if we're not computing sizes, store off the dictionary index + if constexpr (!sizes_only) { sb->dict_idx[rolling_index(pos + t)] = dict_idx; } + } + + // if we're computing sizes, add the length(s) + if constexpr (sizes_only) { + int const len = [&]() { + if (t >= batch_len || (pos + t >= target_pos)) { return 0; } + uint32_t const dict_pos = (s->dict_bits > 0) ? 
dict_idx * sizeof(string_index_pair) : 0; + if (dict_pos < (uint32_t)s->dict_size) { + const auto* src = reinterpret_cast(s->dict_base + dict_pos); + return src->second; + } + return 0; + }(); + + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage; + // note: str_len will only be valid on thread 0. + str_len += WarpReduce(temp_storage).Sum(len); + } + + pos += batch_len; + } + return {pos, str_len}; +} + +/** + * @brief Performs RLE decoding of dictionary indexes, for when dict_size=1 + * + * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output + * @param[in] target_pos Target write position + * @param[in] t Thread ID + * + * @return The new output position + */ +inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s, + page_state_buffers_s volatile* sb, + int target_pos, + int t) +{ + uint8_t const* end = s->data_end; + int pos = s->dict_pos; + + while (pos < target_pos) { + int is_literal, batch_len; + if (!t) { + uint32_t run = s->dict_run; + uint8_t const* cur = s->data_start; + if (run <= 1) { + run = (cur < end) ? get_vlq32(cur, end) : 0; + if (!(run & 1)) { + // Repeated value + s->dict_val = (cur < end) ? cur[0] & 1 : 0; + cur++; + } + } + if (run & 1) { + // Literal batch: must output a multiple of 8, except for the last batch + int batch_len_div8; + batch_len = max(min(32, (int)(run >> 1) * 8), 1); + if (batch_len >= 8) { batch_len &= ~7; } + batch_len_div8 = (batch_len + 7) >> 3; + run -= batch_len_div8 * 2; + cur += batch_len_div8; + } else { + batch_len = max(min(32, (int)(run >> 1)), 1); + run -= batch_len * 2; + } + s->dict_run = run; + s->data_start = cur; + is_literal = run & 1; + __threadfence_block(); + } + __syncwarp(); + is_literal = shuffle(is_literal); + batch_len = shuffle(batch_len); + if (t < batch_len) { + int dict_idx; + if (is_literal) { + int32_t ofs = t - ((batch_len + 7) & ~7); + uint8_t const* p = s->data_start + (ofs >> 3); + dict_idx = (p < end) ? 
(p[0] >> (ofs & 7u)) & 1 : 0; + } else { + dict_idx = s->dict_val; + } + sb->dict_idx[rolling_index(pos + t)] = dict_idx; + } + pos += batch_len; + } + return pos; +} + +/** + * @brief Parses the length and position of strings and returns total length of all strings + * processed + * + * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output + * @param[in] target_pos Target output position + * @param[in] t Thread ID + * + * @return Total length of strings processed + */ +template +__device__ size_type gpuInitStringDescriptors(page_state_s volatile* s, + [[maybe_unused]] page_state_buffers_s volatile* sb, + int target_pos, + int t) +{ + int pos = s->dict_pos; + int total_len = 0; + + // This step is purely serial + if (!t) { + uint8_t const* cur = s->data_start; + int dict_size = s->dict_size; + int k = s->dict_val; + + while (pos < target_pos) { + int len; + if (k + 4 <= dict_size) { + len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); + k += 4; + if (k + len > dict_size) { len = 0; } + } else { + len = 0; + } + if constexpr (!sizes_only) { + sb->dict_idx[rolling_index(pos)] = k; + sb->str_len[rolling_index(pos)] = len; + } + k += len; + total_len += len; + pos++; + } + s->dict_val = k; + __threadfence_block(); + } + + return total_len; +} + +/** + * @brief Decode values out of a definition or repetition stream + * + * @param[in,out] s Page state input/output + * @param[in] t target_count Target count of stream values on output + * @param[in] t Warp0 thread ID (0..31) + * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION + */ +template +__device__ void gpuDecodeStream( + level_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) +{ + uint8_t const* cur_def = s->lvl_start[lvl]; + uint8_t const* end = s->lvl_end; + uint32_t level_run = s->initial_rle_run[lvl]; + int32_t level_val = s->initial_rle_value[lvl]; + int level_bits = s->col.level_bits[lvl]; + int32_t num_input_values = s->num_input_values; + int32_t value_count = s->lvl_count[lvl]; + int32_t batch_coded_count = 0; + + while (value_count < target_count && value_count < num_input_values) { + int batch_len; + if (level_run <= 1) { + // Get a new run symbol from the byte stream + int sym_len = 0; + if (!t) { + uint8_t const* cur = cur_def; + if (cur < end) { level_run = get_vlq32(cur, end); } + if (!(level_run & 1)) { + if (cur < end) level_val = cur[0]; + cur++; + if (level_bits > 8) { + if (cur < end) level_val |= cur[0] << 8; + cur++; + } + } + if (cur > end || level_run <= 1) { s->error = 0x10; } + sym_len = (int32_t)(cur - cur_def); + __threadfence_block(); + } + sym_len = shuffle(sym_len); + level_val = shuffle(level_val); + level_run = shuffle(level_run); + cur_def += sym_len; + } + if (s->error) { break; } + + batch_len = min(num_input_values - value_count, 32); + if (level_run & 1) { + // Literal run + int batch_len8; + batch_len = min(batch_len, (level_run >> 1) * 8); + batch_len8 = (batch_len + 7) >> 3; + if (t < batch_len) { + int bitpos = t * level_bits; + uint8_t const* cur = cur_def + (bitpos >> 3); + bitpos &= 7; + if (cur < end) level_val = cur[0]; + cur++; + if (level_bits > 8 - bitpos && cur < end) { + level_val |= cur[0] << 8; + cur++; + if (level_bits > 16 - bitpos && cur < end) level_val |= cur[0] << 16; + } + level_val = (level_val >> bitpos) & ((1 << level_bits) - 1); + } + level_run -= batch_len8 * 2; + cur_def += batch_len8 * level_bits; + } else { + // Repeated value + batch_len = min(batch_len, level_run >> 
1); + level_run -= batch_len * 2; + } + if (t < batch_len) { + int idx = value_count + t; + output[rolling_index(idx)] = level_val; + } + batch_coded_count += batch_len; + value_count += batch_len; + } + + // update the stream info + if (!t) { + s->lvl_start[lvl] = cur_def; + s->initial_rle_run[lvl] = level_run; + s->initial_rle_value[lvl] = level_val; + s->lvl_count[lvl] = value_count; + } +} + +/** + * @brief Store a validity mask containing value_count bits into the output validity buffer of the + * page. + * + * @param[in,out] nesting_info The page/nesting information to store the mask in. The validity map + * offset is also updated + * @param[in] valid_mask The validity mask to be stored + * @param[in] value_count # of bits in the validity mask + */ +inline __device__ void store_validity(PageNestingDecodeInfo* nesting_info, + uint32_t valid_mask, + int32_t value_count) +{ + int word_offset = nesting_info->valid_map_offset / 32; + int bit_offset = nesting_info->valid_map_offset % 32; + // if we fit entirely in the output word + if (bit_offset + value_count <= 32) { + auto relevant_mask = static_cast((static_cast(1) << value_count) - 1); + + if (relevant_mask == ~0) { + nesting_info->valid_map[word_offset] = valid_mask; + } else { + atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset)); + atomicOr(nesting_info->valid_map + word_offset, (valid_mask & relevant_mask) << bit_offset); + } + } + // we're going to spill over into the next word. + // note : writing both values here is the lazy/slow way. we could be writing just + // the first word and rolling the remaining bits over into the next call. + // however, some basic performance tests shows almost no difference between these two + // methods. More detailed performance testing might be worthwhile here. + else { + uint32_t bits_left = 32 - bit_offset; + + // first word. strip bits_left bits off the beginning and store that + uint32_t relevant_mask = ((1 << bits_left) - 1); + uint32_t mask_word0 = valid_mask & relevant_mask; + atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset)); + atomicOr(nesting_info->valid_map + word_offset, mask_word0 << bit_offset); + + // second word. strip the remainder of the bits off the end and store that + relevant_mask = ((1 << (value_count - bits_left)) - 1); + uint32_t mask_word1 = valid_mask & (relevant_mask << bits_left); + atomicAnd(nesting_info->valid_map + word_offset + 1, ~(relevant_mask)); + atomicOr(nesting_info->valid_map + word_offset + 1, mask_word1 >> bits_left); + } + + nesting_info->valid_map_offset += value_count; +} + +/** + * @brief Compute the nesting bounds within the hierarchy to add values to, and the definition level + * D to which we should considered them null or not. + * + * @param[out] start_depth The start nesting depth + * @param[out] end_depth The end nesting depth (inclusive) + * @param[out] d The definition level up to which added values are not-null. 
if t is out of bounds, + * d will be -1 + * @param[in] s Local page information + * @param[in] rep Repetition level buffer + * @param[in] def Definition level buffer + * @param[in] input_value_count The current count of input level values we have processed + * @param[in] target_input_value_count The desired # of input level values we want to process + * @param[in] t Thread index + */ +template +inline __device__ void get_nesting_bounds(int& start_depth, + int& end_depth, + int& d, + page_state_s* s, + level_t const* const rep, + level_t const* const def, + int input_value_count, + int32_t target_input_value_count, + int t) +{ + start_depth = -1; + end_depth = -1; + d = -1; + if (input_value_count + t < target_input_value_count) { + int const index = rolling_lvl_index(input_value_count + t); + d = static_cast(def[index]); + // if we have repetition (there are list columns involved) we have to + // bound what nesting levels we apply values to + if (s->col.max_level[level_type::REPETITION] > 0) { + int r = rep[index]; + start_depth = s->nesting_info[r].start_depth; + end_depth = s->nesting_info[d].end_depth; + } + // for columns without repetition (even ones involving structs) we always + // traverse the entire hierarchy. + else { + start_depth = 0; + end_depth = s->col.max_nesting_depth - 1; + } + } +} + +/** + * @brief Process a batch of incoming repetition/definition level values and generate + * validity, nested column offsets (where appropriate) and decoding indices. + * + * @param[in] target_input_value_count The # of repetition/definition levels to process up to + * @param[in] s Local page information + * @param[out] sb Page state buffer output + * @param[in] rep Repetition level buffer + * @param[in] def Definition level buffer + * @param[in] t Thread index + */ +template +__device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, + page_state_s* s, + page_state_buffers_s* sb, + level_t const* const rep, + level_t const* const def, + int t) +{ + // max nesting depth of the column + int const max_depth = s->col.max_nesting_depth; + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + // how many (input) values we've processed in the page so far + int input_value_count = s->input_value_count; + // how many rows we've processed in the page so far + int input_row_count = s->input_row_count; + + PageNestingDecodeInfo* nesting_info_base = s->nesting_info; + + // process until we've reached the target + while (input_value_count < target_input_value_count) { + // determine the nesting bounds for this thread (the range of nesting depths we + // will generate new value indices and validity bits for) + int start_depth, end_depth, d; + get_nesting_bounds( + start_depth, end_depth, d, s, rep, def, input_value_count, target_input_value_count, t); + + // 4 interesting things to track: + // thread_value_count : # of output values from the view of this thread + // warp_value_count : # of output values for the whole warp + // + // thread_valid_count : # of valid values from the view of this thread + // warp_valid_count : # of valid values for the whole warp + uint32_t thread_value_count, warp_value_count; + uint32_t thread_valid_count, warp_valid_count; + + // track (page-relative) row index for the thread so we can compare against input bounds + // keep track of overall # of rows we've read. + int const is_new_row = start_depth == 0 ? 
1 : 0; + uint32_t const warp_row_count_mask = ballot(is_new_row); + int32_t const thread_row_index = + input_row_count + ((__popc(warp_row_count_mask & ((1 << t) - 1)) + is_new_row) - 1); + input_row_count += __popc(warp_row_count_mask); + // is this thread within read row bounds? + int const in_row_bounds = thread_row_index >= s->row_index_lower_bound && + thread_row_index < (s->first_row + s->num_rows) + ? 1 + : 0; + + // compute warp and thread value counts + uint32_t const warp_count_mask = + ballot((0 >= start_depth && 0 <= end_depth) && in_row_bounds ? 1 : 0); + + warp_value_count = __popc(warp_count_mask); + // Note : ((1 << t) - 1) implies "for all threads before me" + thread_value_count = __popc(warp_count_mask & ((1 << t) - 1)); + + // walk from 0 to max_depth + uint32_t next_thread_value_count, next_warp_value_count; + for (int s_idx = 0; s_idx < max_depth; s_idx++) { + PageNestingDecodeInfo* nesting_info = &nesting_info_base[s_idx]; + + // if we are within the range of nesting levels we should be adding value indices for + int const in_nesting_bounds = + ((s_idx >= start_depth && s_idx <= end_depth) && in_row_bounds) ? 1 : 0; + + // everything up to the max_def_level is a non-null value + uint32_t const is_valid = d >= nesting_info->max_def_level && in_nesting_bounds ? 1 : 0; + + // compute warp and thread valid counts + uint32_t const warp_valid_mask = + // for flat schemas, a simple ballot_sync gives us the correct count and bit positions + // because every value in the input matches to a value in the output + !has_repetition + ? ballot(is_valid) + : + // for nested schemas, it's more complicated. This warp will visit 32 incoming values, + // however not all of them will necessarily represent a value at this nesting level. so + // the validity bit for thread t might actually represent output value t-6. the correct + // position for thread t's bit is cur_value_count. for cuda 11 we could use + // __reduce_or_sync(), but until then we have to do a warp reduce. + WarpReduceOr32(is_valid << thread_value_count); + + thread_valid_count = __popc(warp_valid_mask & ((1 << thread_value_count) - 1)); + warp_valid_count = __popc(warp_valid_mask); + + // if this is the value column emit an index for value decoding + if (is_valid && s_idx == max_depth - 1) { + int const src_pos = nesting_info->valid_count + thread_valid_count; + int const dst_pos = nesting_info->value_count + thread_value_count; + // nz_idx is a mapping of src buffer indices to destination buffer indices + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + + // compute warp and thread value counts for the -next- nesting level. we need to + // do this for nested schemas so that we can emit an offset for the -current- nesting + // level. more concretely : the offset for the current nesting level == current length of the + // next nesting level + if (s_idx < max_depth - 1) { + uint32_t const next_warp_count_mask = + ballot((s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0); + next_warp_value_count = __popc(next_warp_count_mask); + next_thread_value_count = __popc(next_warp_count_mask & ((1 << t) - 1)); + + // if we're -not- at a leaf column and we're within nesting/row bounds + // and we have a valid data_out pointer, it implies this is a list column, so + // emit an offset. 
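+ // (concretely: the offset written at depth s_idx is the running value count of depth s_idx + 1
+ // plus that level's page_start_value, i.e. where this page's values begin in the output column)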
+ if (in_nesting_bounds && nesting_info->data_out != nullptr) { + int const idx = nesting_info->value_count + thread_value_count; + cudf::size_type const ofs = nesting_info_base[s_idx + 1].value_count + + next_thread_value_count + + nesting_info_base[s_idx + 1].page_start_value; + (reinterpret_cast(nesting_info->data_out))[idx] = ofs; + } + } + + // nested schemas always read and write to the same bounds (that is, read and write positions + // are already pre-bounded by first_row/num_rows). flat schemas will start reading at the + // first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector here + // we need to adjust our computed mask to take into account the write row bounds. + int const in_write_row_bounds = + !has_repetition + ? thread_row_index >= s->first_row && thread_row_index < (s->first_row + s->num_rows) + : in_row_bounds; + int const first_thread_in_write_range = + !has_repetition ? __ffs(ballot(in_write_row_bounds)) - 1 : 0; + + // # of bits to of the validity mask to write out + int const warp_valid_mask_bit_count = + first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range; + + // increment count of valid values, count of total values, and update validity mask + if (!t) { + if (nesting_info->valid_map != nullptr && warp_valid_mask_bit_count > 0) { + uint32_t const warp_output_valid_mask = warp_valid_mask >> first_thread_in_write_range; + store_validity(nesting_info, warp_output_valid_mask, warp_valid_mask_bit_count); + + nesting_info->null_count += warp_valid_mask_bit_count - __popc(warp_output_valid_mask); + } + nesting_info->valid_count += warp_valid_count; + nesting_info->value_count += warp_value_count; + } + + // propagate value counts for the next level + warp_value_count = next_warp_value_count; + thread_value_count = next_thread_value_count; + } + + input_value_count += min(32, (target_input_value_count - input_value_count)); + __syncwarp(); + } + + // update + if (!t) { + // update valid value count for decoding and total # of values we've processed + s->nz_count = nesting_info_base[max_depth - 1].valid_count; + s->input_value_count = input_value_count; + s->input_row_count = input_row_count; + } +} + +/** + * @brief Process repetition and definition levels up to the target count of leaf values. + * + * In order to decode actual leaf values from the input stream, we need to generate the + * list of non-null value positions (page_state_s::nz_idx). We do this by processing + * the repetition and definition level streams. This process also generates validity information, + * and offset column values in the case of nested schemas. Because of the way the streams + * are encoded, this function may generate slightly more than target_leaf_count. + * + * Only runs on 1 warp. 
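+ * (In the decode kernels it is invoked only from threads 0..31 of the block.)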
+ * + * @param[in] s The local page state + * @param[out] sb Page state buffer output + * @param[in] target_leaf_count Target count of non-null leaf values to generate indices for + * @param[in] rep Repetition level buffer + * @param[in] def Definition level buffer + * @param[in] t Thread index + */ +template +__device__ void gpuDecodeLevels(page_state_s* s, + page_state_buffers_s* sb, + int32_t target_leaf_count, + level_t* const rep, + level_t* const def, + int t) +{ + bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; + + constexpr int batch_size = 32; + int cur_leaf_count = target_leaf_count; + while (!s->error && s->nz_count < target_leaf_count && + s->input_value_count < s->num_input_values) { + if (has_repetition) { gpuDecodeStream(rep, s, cur_leaf_count, t, level_type::REPETITION); } + gpuDecodeStream(def, s, cur_leaf_count, t, level_type::DEFINITION); + __syncwarp(); + + // because the rep and def streams are encoded separately, we cannot request an exact + // # of values to be decoded at once. we can only process the lowest # of decoded rep/def + // levels we get. + int actual_leaf_count = has_repetition ? min(s->lvl_count[level_type::REPETITION], + s->lvl_count[level_type::DEFINITION]) + : s->lvl_count[level_type::DEFINITION]; + + // process what we got back + gpuUpdateValidityOffsetsAndRowIndices( + actual_leaf_count, s, sb, rep, def, t); + cur_leaf_count = actual_leaf_count + batch_size; + __syncwarp(); + } +} + +/** + * @brief Parse the beginning of the level section (definition or repetition), + * initializes the initial RLE run & value, and returns the section length + * + * @param[in,out] s The page state + * @param[in] cur The current data position + * @param[in] end The end of the data + * @param[in] level_bits The bits required + * @param[in] is_decode_step True if we are performing the decode step. + * @param[in,out] decoders The repetition and definition level stream decoders + * + * @return The length of the section + */ +inline __device__ uint32_t InitLevelSection(page_state_s* s, + uint8_t const* cur, + uint8_t const* end, + level_type lvl) +{ + int32_t len; + int level_bits = s->col.level_bits[lvl]; + Encoding encoding = lvl == level_type::DEFINITION ? s->page.definition_level_encoding + : s->page.repetition_level_encoding; + + auto start = cur; + if (level_bits == 0) { + len = 0; + s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value + s->initial_rle_value[lvl] = 0; + s->lvl_start[lvl] = cur; + s->abs_lvl_start[lvl] = cur; + } else if (encoding == Encoding::RLE) { + // V2 only uses RLE encoding, so only perform check here + if (s->page.def_lvl_bytes || s->page.rep_lvl_bytes) { + len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes; + } else if (cur + 4 < end) { + len = 4 + (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24); + cur += 4; + } else { + len = 0; + s->error = 2; + } + s->abs_lvl_start[lvl] = cur; + if (!s->error) { + uint32_t run = get_vlq32(cur, end); + s->initial_rle_run[lvl] = run; + if (!(run & 1)) { + int v = (cur < end) ? cur[0] : 0; + cur++; + if (level_bits > 8) { + v |= ((cur < end) ? 
cur[0] : 0) << 8; + cur++; + } + s->initial_rle_value[lvl] = v; + } + s->lvl_start[lvl] = cur; + } + + if (cur > end) { s->error = 2; } + } else if (encoding == Encoding::BIT_PACKED) { + len = (s->page.num_input_values * level_bits + 7) >> 3; + s->initial_rle_run[lvl] = ((s->page.num_input_values + 7) >> 3) * 2 + 1; // literal run + s->initial_rle_value[lvl] = 0; + s->lvl_start[lvl] = cur; + s->abs_lvl_start[lvl] = cur; + } else { + s->error = 3; + len = 0; + } + + s->abs_lvl_end[lvl] = start + len; + + return static_cast(len); +} + +/** + * @brief Functor for setupLocalPageInfo that always returns true. + */ +struct all_types_filter { + __device__ inline bool operator()(PageInfo const& page) { return true; } +}; + +/** + * @brief Functor for setupLocalPageInfo that returns true if this is not a string column. + */ +struct non_string_filter { + device_span chunks; + + __device__ inline bool operator()(PageInfo const& page) { return !is_string_col(page, chunks); } +}; + +/** + * @brief Functor for setupLocalPageInfo that returns true if this is a string column. + */ +struct string_filter { + device_span chunks; + + __device__ inline bool operator()(PageInfo const& page) { return is_string_col(page, chunks); } +}; + +/** + * @brief Sets up block-local page state information from the global pages. + * + * @param[in, out] s The local page state to be filled in + * @param[in] p The global page to be copied from + * @param[in] chunks The global list of chunks + * @param[in] min_row Crop all rows below min_row + * @param[in] num_rows Maximum number of rows to read + * @param[in] filter Filtering function used to decide which pages to operate on + * @param[in] is_decode_step If we are setting up for the decode step (instead of the preprocess) + * @param[in] decoders rle_stream decoders which will be used for decoding levels. Optional. 
+ * @tparam Filter Function that takes a PageInfo reference and returns true if the given page should + * be operated on Currently only used by gpuComputePageSizes step) + */ +template +inline __device__ bool setupLocalPageInfo(page_state_s* const s, + PageInfo const* p, + device_span chunks, + size_t min_row, + size_t num_rows, + Filter filter, + bool is_decode_step) +{ + int t = threadIdx.x; + + // Fetch page info + if (!t) { + s->page = *p; + s->nesting_info = nullptr; + s->col = chunks[s->page.chunk_idx]; + } + __syncthreads(); + + // return false if this is a dictionary page or it does not pass the filter condition + if ((s->page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0 || !filter(s->page)) { return false; } + + // our starting row (absolute index) is + // col.start_row == absolute row index + // page.chunk-row == relative row index within the chunk + size_t const page_start_row = s->col.start_row + s->page.chunk_row; + + // if we can use the nesting decode cache, set it up now + auto const can_use_decode_cache = s->page.nesting_info_size <= max_cacheable_nesting_decode_info; + if (can_use_decode_cache) { + int depth = 0; + while (depth < s->page.nesting_info_size) { + int const thread_depth = depth + t; + if (thread_depth < s->page.nesting_info_size) { + // these values need to be copied over from global + s->nesting_decode_cache[thread_depth].max_def_level = + s->page.nesting_decode[thread_depth].max_def_level; + s->nesting_decode_cache[thread_depth].page_start_value = + s->page.nesting_decode[thread_depth].page_start_value; + s->nesting_decode_cache[thread_depth].start_depth = + s->page.nesting_decode[thread_depth].start_depth; + s->nesting_decode_cache[thread_depth].end_depth = + s->page.nesting_decode[thread_depth].end_depth; + } + depth += blockDim.x; + } + } + + if (!t) { + s->nesting_info = can_use_decode_cache ? s->nesting_decode_cache : s->page.nesting_decode; + + // NOTE: s->page.num_rows, s->col.chunk_row, s->first_row and s->num_rows will be + // invalid/bogus during first pass of the preprocess step for nested types. this is ok + // because we ignore these values in that stage. + auto const max_row = min_row + num_rows; + + // if we are totally outside the range of the input, do nothing + if ((page_start_row > max_row) || (page_start_row + s->page.num_rows < min_row)) { + s->first_row = 0; + s->num_rows = 0; + } + // otherwise + else { + s->first_row = page_start_row >= min_row ? 0 : min_row - page_start_row; + auto const max_page_rows = s->page.num_rows - s->first_row; + s->num_rows = (page_start_row + s->first_row) + max_page_rows <= max_row + ? max_page_rows + : max_row - (page_start_row + s->first_row); + } + } + + __syncthreads(); + + // zero counts + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + int const thread_depth = depth + t; + if (thread_depth < s->page.num_output_nesting_levels) { + s->nesting_info[thread_depth].valid_count = 0; + s->nesting_info[thread_depth].value_count = 0; + s->nesting_info[thread_depth].null_count = 0; + } + depth += blockDim.x; + } + __syncthreads(); + + // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. 
+ // + // corner case: in the case of lists, we can have pages that contain "0" rows if the current row + // starts before this page and ends after this page: + // P0 P1 P2 + // |---------|---------|----------| + // ^------------------^ + // row start row end + // P1 will contain 0 rows + // + // NOTE: this check needs to be done after the null counts have been zeroed out + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + if (is_decode_step && s->num_rows == 0 && + !(has_repetition && (is_bounds_page(s, min_row, num_rows, has_repetition) || + is_page_contained(s, min_row, num_rows)))) { + return false; + } + + if (!t) { + s->error = 0; + + // IMPORTANT : nested schemas can have 0 rows in a page but still have + // values. The case is: + // - On page N-1, the last row starts, with 2/6 values encoded + // - On page N, the remaining 4/6 values are encoded, but there are no new rows. + // if (s->page.num_input_values > 0 && s->page.num_rows > 0) { + if (s->page.num_input_values > 0) { + uint8_t* cur = s->page.page_data; + uint8_t* end = cur + s->page.uncompressed_page_size; + + uint32_t dtype_len_out = s->col.data_type >> 3; + s->ts_scale = 0; + // Validate data type + auto const data_type = s->col.data_type & 7; + switch (data_type) { + case BOOLEAN: + s->dtype_len = 1; // Boolean are stored as 1 byte on the output + break; + case INT32: [[fallthrough]]; + case FLOAT: s->dtype_len = 4; break; + case INT64: + if (s->col.ts_clock_rate) { + int32_t units = 0; + // Duration types are not included because no scaling is done when reading + if (s->col.converted_type == TIMESTAMP_MILLIS) { + units = cudf::timestamp_ms::period::den; + } else if (s->col.converted_type == TIMESTAMP_MICROS) { + units = cudf::timestamp_us::period::den; + } else if (s->col.logical_type.TIMESTAMP.unit.isset.NANOS) { + units = cudf::timestamp_ns::period::den; + } + if (units and units != s->col.ts_clock_rate) { + s->ts_scale = (s->col.ts_clock_rate < units) ? 
-(units / s->col.ts_clock_rate) + : (s->col.ts_clock_rate / units); + } + } + [[fallthrough]]; + case DOUBLE: s->dtype_len = 8; break; + case INT96: s->dtype_len = 12; break; + case BYTE_ARRAY: + if (s->col.converted_type == DECIMAL) { + auto const decimal_precision = s->col.decimal_precision; + s->dtype_len = [decimal_precision]() { + if (decimal_precision <= MAX_DECIMAL32_PRECISION) { + return sizeof(int32_t); + } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { + return sizeof(int64_t); + } else { + return sizeof(__int128_t); + } + }(); + } else { + s->dtype_len = sizeof(string_index_pair); + } + break; + default: // FIXED_LEN_BYTE_ARRAY: + s->dtype_len = dtype_len_out; + s->error |= (s->dtype_len <= 0); + break; + } + // Special check for downconversions + s->dtype_len_in = s->dtype_len; + if (s->col.converted_type == DECIMAL && data_type == FIXED_LEN_BYTE_ARRAY) { + s->dtype_len = [dtype_len = s->dtype_len]() { + if (dtype_len <= sizeof(int32_t)) { + return sizeof(int32_t); + } else if (dtype_len <= sizeof(int64_t)) { + return sizeof(int64_t); + } else { + return sizeof(__int128_t); + } + }(); + } else if (data_type == INT32) { + if (dtype_len_out == 1) { + // INT8 output + s->dtype_len = 1; + } else if (dtype_len_out == 2) { + // INT16 output + s->dtype_len = 2; + } else if (s->col.converted_type == TIME_MILLIS) { + // INT64 output + s->dtype_len = 8; + } + } else if (data_type == BYTE_ARRAY && dtype_len_out == 4) { + s->dtype_len = 4; // HASH32 output + } else if (data_type == INT96) { + s->dtype_len = 8; // Convert to 64-bit timestamp + } + + // during the decoding step we need to offset the global output buffers + // for each level of nesting so that we write to the section this page + // is responsible for. + // - for flat schemas, we can do this directly by using row counts + // - for nested schemas, these offsets are computed during the preprocess step + // + // NOTE: in a chunked read situation, s->col.column_data_base and s->col.valid_map_base + // will be aliased to memory that has been freed when we get here in the non-decode step, so + // we cannot check against nullptr. we'll just check a flag directly. + if (is_decode_step) { + int max_depth = s->col.max_nesting_depth; + for (int idx = 0; idx < max_depth; idx++) { + PageNestingDecodeInfo* nesting_info = &s->nesting_info[idx]; + + size_t output_offset; + // schemas without lists + if (s->col.max_level[level_type::REPETITION] == 0) { + output_offset = page_start_row >= min_row ? page_start_row - min_row : 0; + } + // for schemas with lists, we've already got the exact value precomputed + else { + output_offset = nesting_info->page_start_value; + } + + if (s->col.column_data_base != nullptr) { + nesting_info->data_out = static_cast(s->col.column_data_base[idx]); + if (s->col.column_string_base != nullptr) { + nesting_info->string_out = static_cast(s->col.column_string_base[idx]); + } + + nesting_info->data_out = static_cast(s->col.column_data_base[idx]); + + if (nesting_info->data_out != nullptr) { + // anything below max depth with a valid data pointer must be a list, so the + // element size is the size of the offset type. + uint32_t len = idx < max_depth - 1 ? sizeof(cudf::size_type) : s->dtype_len; + // if this is a string column, then dtype_len is a lie. data will be offsets rather + // than (ptr,len) tuples. 
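+ // note: dtype_len == 4 for BYTE_ARRAY corresponds to the HASH32 case above, so it is excluded here.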
+ if (data_type == BYTE_ARRAY && s->dtype_len != 4) { len = sizeof(cudf::size_type); } + nesting_info->data_out += (output_offset * len); + } + if (nesting_info->string_out != nullptr) { + nesting_info->string_out += s->page.str_offset; + } + nesting_info->valid_map = s->col.valid_map_base[idx]; + if (nesting_info->valid_map != nullptr) { + nesting_info->valid_map += output_offset >> 5; + nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f); + } + } + } + } + s->first_output_value = 0; + + // Find the compressed size of repetition levels + cur += InitLevelSection(s, cur, end, level_type::REPETITION); + // Find the compressed size of definition levels + cur += InitLevelSection(s, cur, end, level_type::DEFINITION); + + s->dict_bits = 0; + s->dict_base = nullptr; + s->dict_size = 0; + // NOTE: if additional encodings are supported in the future, modifications must + // be made to is_supported_encoding() in reader_impl_preprocess.cu + switch (s->page.encoding) { + case Encoding::PLAIN_DICTIONARY: + case Encoding::RLE_DICTIONARY: + // RLE-packed dictionary indices, first byte indicates index length in bits + if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) { + // String dictionary: use index + s->dict_base = reinterpret_cast(s->col.str_dict_index); + s->dict_size = s->col.page_info[0].num_input_values * sizeof(string_index_pair); + } else { + s->dict_base = + s->col.page_info[0].page_data; // dictionary is always stored in the first page + s->dict_size = s->col.page_info[0].uncompressed_page_size; + } + s->dict_run = 0; + s->dict_val = 0; + s->dict_bits = (cur < end) ? *cur++ : 0; + if (s->dict_bits > 32 || !s->dict_base) { s->error = (10 << 8) | s->dict_bits; } + break; + case Encoding::PLAIN: + s->dict_size = static_cast(end - cur); + s->dict_val = 0; + if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } + break; + case Encoding::RLE: s->dict_run = 0; break; + default: + s->error = 1; // Unsupported encoding + break; + } + if (cur > end) { s->error = 1; } + s->lvl_end = cur; + s->data_start = cur; + s->data_end = end; + } else { + s->error = 1; + } + + s->lvl_count[level_type::REPETITION] = 0; + s->lvl_count[level_type::DEFINITION] = 0; + s->nz_count = 0; + s->num_input_values = s->page.num_input_values; + s->dict_pos = 0; + s->src_pos = 0; + + // for flat hierarchies, we can't know how many leaf values to skip unless we do a full + // preprocess of the definition levels (since nulls will have no actual decodable value, there + // is no direct correlation between # of rows and # of decodable values). so we will start + // processing at the beginning of the value stream and disregard any indices that start + // before the first row. + if (s->col.max_level[level_type::REPETITION] == 0) { + s->page.skipped_values = 0; + s->page.skipped_leaf_values = 0; + s->input_value_count = 0; + s->input_row_count = 0; + s->input_leaf_count = 0; + + s->row_index_lower_bound = -1; + } + // for nested hierarchies, we have run a preprocess that lets us skip directly to the values + // we need to start decoding at + else { + // input_row_count translates to "how many rows we have processed so far", so since we are + // skipping directly to where we want to start decoding, set it to first_row + s->input_row_count = s->first_row; + + // return the lower bound to compare (page-relative) thread row index against. Explanation: + // In the case of nested schemas, rows can span page boundaries. 
That is to say, + // we can encounter the first value for row X on page M, but the last value for page M + // might not be the last value for row X. page M+1 (or further) may contain the last value. + // + // This means that the first values we encounter for a given page (M+1) may not belong to the + // row indicated by chunk_row, but to the row before it that spanned page boundaries. If that + // previous row is within the overall row bounds, include the values by allowing relative row + // index -1 + int const max_row = (min_row + num_rows) - 1; + if (min_row < page_start_row && max_row >= page_start_row - 1) { + s->row_index_lower_bound = -1; + } else { + s->row_index_lower_bound = s->first_row; + } + + // if we're in the decoding step, jump directly to the first + // value we care about + if (is_decode_step) { + s->input_value_count = s->page.skipped_values > -1 ? s->page.skipped_values : 0; + } else { + s->input_value_count = 0; + s->input_leaf_count = 0; + s->page.skipped_values = + -1; // magic number to indicate it hasn't been set for use inside UpdatePageSizes + s->page.skipped_leaf_values = 0; + } + } + + __threadfence_block(); + } + __syncthreads(); + + return true; +} + +} // namespace cudf::io::parquet::gpu diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu new file mode 100644 index 00000000000..9173d408192 --- /dev/null +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -0,0 +1,865 @@ +/* + * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "page_decode.cuh" + +#include +#include + +namespace cudf { +namespace io { +namespace parquet { +namespace gpu { + +namespace { + +// stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. +// copies from src to dst in 16B chunks per thread. +__device__ void wideStrcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) +{ + using cudf::detail::warp_size; + using cudf::strings::detail::load_uint4; + + constexpr size_t out_datatype_size = sizeof(uint4); + constexpr size_t in_datatype_size = sizeof(uint); + + auto const alignment_offset = reinterpret_cast(dst) % out_datatype_size; + uint4* out_chars_aligned = reinterpret_cast(dst - alignment_offset); + auto const in_start = src; + + // Both `out_start_aligned` and `out_end_aligned` are indices into `dst`. + // `out_start_aligned` is the first 16B aligned memory location after `dst + 4`. + // `out_end_aligned` is the last 16B aligned memory location before `len - 4`. Characters + // between `[out_start_aligned, out_end_aligned)` will be copied using uint4. + // `dst + 4` and `len - 4` are used instead of `dst` and `len` to avoid + // `load_uint4` reading beyond string boundaries. + // use signed int since out_end_aligned can be negative. 
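+ // e.g. if dst is 16-byte aligned (alignment_offset == 0) and len == 100, then out_start_aligned == 16
+ // and out_end_aligned == 96: bytes [16, 96) are copied as uint4, the head and tail bytes one at a time.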
+ int64_t out_start_aligned = (in_datatype_size + alignment_offset + out_datatype_size - 1) / + out_datatype_size * out_datatype_size - + alignment_offset; + int64_t out_end_aligned = + (len - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size - + alignment_offset; + + for (int64_t ichar = out_start_aligned + lane_id * out_datatype_size; ichar < out_end_aligned; + ichar += warp_size * out_datatype_size) { + *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) = + load_uint4((const char*)in_start + ichar); + } + + // Tail logic: copy characters of the current string outside + // `[out_start_aligned, out_end_aligned)`. + if (out_end_aligned <= out_start_aligned) { + // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the + // entire string. + for (int64_t ichar = lane_id; ichar < len; ichar += warp_size) { + dst[ichar] = in_start[ichar]; + } + } else { + // Copy characters in range `[0, out_start_aligned)`. + if (lane_id < out_start_aligned) { dst[lane_id] = in_start[lane_id]; } + // Copy characters in range `[out_end_aligned, len)`. + int64_t ichar = out_end_aligned + lane_id; + if (ichar < len) { dst[ichar] = in_start[ichar]; } + } +} + +/** + * @brief char-parallel string copy. + */ +__device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) +{ + using cudf::detail::warp_size; + if (len > 64) { + wideStrcpy(dst, src, len, lane_id); + } else { + for (int i = lane_id; i < len; i += warp_size) { + dst[i] = src[i]; + } + } +} + +/** + * @brief Perform exclusive scan on an array of any length using a single block of threads. + */ +template +__device__ void block_excl_sum(size_type* arr, size_type length, size_type initial_value) +{ + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + int const t = threadIdx.x; + + // do a series of block sums, storing results in arr as we go + for (int pos = 0; pos < length; pos += block_size) { + int const tidx = pos + t; + size_type tval = tidx < length ? arr[tidx] : 0; + size_type block_sum; + block_scan(scan_storage).ExclusiveScan(tval, tval, initial_value, cub::Sum(), block_sum); + if (tidx < length) { arr[tidx] = tval; } + initial_value += block_sum; + } +} + +/** + * @brief Compute the start and end page value bounds for this page + * + * This uses definition and repetition level info to determine the number of valid and null + * values for the page, taking into account skip_rows/num_rows (if set). 
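+ * For pages completely contained within the row bounds only the null and valid counts are
+ * computed; for bounds pages the start and end value indices within the bounds are located as well.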
+ * + * @param s The local page info + * @param min_row Row index to start reading at + * @param num_rows Maximum number of rows to read + * @param is_bounds_pg True if this page is clipped + * @param has_repetition True if the schema is nested + * @param decoders Definition and repetition level decoders + * @return pair containing start and end value indexes + * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels + * @tparam level_t Type used to store decoded repetition and definition levels + */ +template +__device__ thrust::pair page_bounds(page_state_s* const s, + size_t min_row, + size_t num_rows, + bool is_bounds_pg, + bool has_repetition, + rle_stream* decoders) +{ + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + int const t = threadIdx.x; + + // decode batches of level stream data using rle_stream objects and use the results to + // calculate start and end value positions in the encoded string data. + int const max_depth = s->col.max_nesting_depth; + int const max_def = s->nesting_info[max_depth - 1].max_def_level; + + // can skip all this if we know there are no nulls + if (max_def == 0 && !is_bounds_pg) { + s->page.num_valids = s->num_input_values; + s->page.num_nulls = 0; + return {0, s->num_input_values}; + } + + int start_value = 0; + int end_value = s->page.num_input_values; + auto const pp = &s->page; + auto const col = &s->col; + + // initialize the stream decoders (requires values computed in setupLocalPageInfo) + int const max_batch_size = lvl_buf_size; + auto const def_decode = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); + auto const rep_decode = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); + decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], + s->abs_lvl_start[level_type::DEFINITION], + s->abs_lvl_end[level_type::DEFINITION], + max_batch_size, + def_decode, + s->page.num_input_values); + // only need repetition if this is a bounds page. otherwise all we need is def level info + // to count the nulls. + if (has_repetition && is_bounds_pg) { + decoders[level_type::REPETITION].init(s->col.level_bits[level_type::REPETITION], + s->abs_lvl_start[level_type::REPETITION], + s->abs_lvl_end[level_type::REPETITION], + max_batch_size, + rep_decode, + s->page.num_input_values); + } + + int processed = 0; + + // if this is a bounds page, we need to do extra work to find the start and/or end value index + if (is_bounds_pg) { + __shared__ int skipped_values; + __shared__ int skipped_leaf_values; + __shared__ int last_input_value; + __shared__ int end_val_idx; + + // need these for skip_rows case + auto const page_start_row = col->start_row + pp->chunk_row; + auto const max_row = min_row + num_rows; + auto const begin_row = page_start_row >= min_row ? 0 : min_row - page_start_row; + auto const max_page_rows = pp->num_rows - begin_row; + auto const page_rows = page_start_row + begin_row + max_page_rows <= max_row + ? 
max_page_rows + : max_row - (page_start_row + begin_row); + auto end_row = begin_row + page_rows; + int row_fudge = -1; + + // short circuit for no nulls + if (max_def == 0 && !has_repetition) { return {begin_row, end_row}; } + + int row_count = 0; + int leaf_count = 0; + bool skipped_values_set = false; + bool end_value_set = false; + + while (processed < s->page.num_input_values) { + int start_val = processed; + + if (has_repetition) { + decoders[level_type::REPETITION].decode_next(t); + __syncthreads(); + + // special case where page does not begin at a row boundary + if (processed == 0 && rep_decode[0] != 0) { + if (t == 0) { + skipped_values = 0; + skipped_leaf_values = 0; + } + skipped_values_set = true; + end_row++; // need to finish off the previous row + row_fudge = 0; + } + } + + // the # of rep/def levels will always be the same size + processed += decoders[level_type::DEFINITION].decode_next(t); + __syncthreads(); + + // do something with the level data + while (start_val < processed) { + int idx_t = start_val + t; + int idx = rolling_lvl_index(idx_t); + + // get absolute thread row index + int is_new_row = idx_t < processed && (!has_repetition || rep_decode[idx] == 0); + int thread_row_count, block_row_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(is_new_row, thread_row_count, block_row_count); + __syncthreads(); + + // get absolute thread leaf index + int const is_new_leaf = idx_t < processed && (def_decode[idx] >= max_def); + int thread_leaf_count, block_leaf_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(is_new_leaf, thread_leaf_count, block_leaf_count); + __syncthreads(); + + // if we have not set skipped values yet, see if we found the first in-bounds row + if (!skipped_values_set && row_count + block_row_count > begin_row) { + // if this thread is in row bounds + int const row_index = thread_row_count + row_count - 1; + int const in_row_bounds = + idx_t < processed && (row_index >= begin_row) && (row_index < end_row); + + int local_count, global_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(in_row_bounds, local_count, global_count); + __syncthreads(); + + // we found it + if (global_count > 0) { + // this is the thread that represents the first row. need to test in_row_bounds for + // the case where we only want one row and local_count == 1 for many threads. + if (local_count == 1 && in_row_bounds) { + skipped_values = idx_t; + skipped_leaf_values = + leaf_count + (is_new_leaf ? thread_leaf_count - 1 : thread_leaf_count); + } + skipped_values_set = true; + } + } + + // test if row_count will exceed end_row in this batch + if (!end_value_set && row_count + block_row_count >= end_row) { + // if this thread exceeds row bounds. row_fudge change depending on whether we've faked + // the end row to account for starting a page in the middle of a row. + int const row_index = thread_row_count + row_count + row_fudge; + int const exceeds_row_bounds = row_index >= end_row; + + int local_count, global_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(exceeds_row_bounds, local_count, global_count); + __syncthreads(); + + // we found it + if (global_count > 0) { + // this is the thread that represents the end row. + if (local_count == 1) { + last_input_value = idx_t; + end_val_idx = leaf_count + (is_new_leaf ? 
thread_leaf_count - 1 : thread_leaf_count); + } + end_value_set = true; + break; + } + } + + row_count += block_row_count; + leaf_count += block_leaf_count; + + start_val += preprocess_block_size; + } + __syncthreads(); + if (end_value_set) { break; } + } + + start_value = skipped_values_set ? skipped_leaf_values : 0; + end_value = end_value_set ? end_val_idx : leaf_count; + + if (t == 0) { + int const v0 = skipped_values_set ? skipped_values : 0; + int const vn = end_value_set ? last_input_value : s->num_input_values; + int const total_values = vn - v0; + int const total_leaf_values = end_value - start_value; + int const num_nulls = total_values - total_leaf_values; + pp->num_nulls = num_nulls; + pp->num_valids = total_leaf_values; + } + } + // already filtered out unwanted pages, so need to count all non-null values in this page + else { + int num_nulls = 0; + while (processed < s->page.num_input_values) { + int start_val = processed; + processed += decoders[level_type::DEFINITION].decode_next(t); + __syncthreads(); + + while (start_val < processed) { + int idx_t = start_val + t; + if (idx_t < processed) { + int idx = rolling_lvl_index(idx_t); + if (def_decode[idx] < max_def) { num_nulls++; } + } + start_val += preprocess_block_size; + } + __syncthreads(); + } + + int const null_count = block_reduce(temp_storage.reduce_storage).Sum(num_nulls); + + if (t == 0) { + pp->num_nulls = null_count; + pp->num_valids = pp->num_input_values - null_count; + } + __syncthreads(); + + end_value -= pp->num_nulls; + } + + return {start_value, end_value}; +} + +/** + * @brief Compute string size information for dictionary encoded strings. + * + * @param data Pointer to the start of the page data stream + * @param dict_base Pointer to the start of the dictionary + * @param dict_bits The number of bits used to in the dictionary bit packing + * @param dict_size Size of the dictionary in bytes + * @param data_size Size of the page data in bytes + * @param start_value Do not count values that occur before this index + * @param end_value Do not count values that occur after this index + */ +__device__ size_t totalDictEntriesSize(uint8_t const* data, + uint8_t const* dict_base, + int dict_bits, + int dict_size, + int data_size, + int start_value, + int end_value) +{ + int const t = threadIdx.x; + uint8_t const* ptr = data; + uint8_t const* const end = data + data_size; + int const bytecnt = (dict_bits + 7) >> 3; + size_t l_str_len = 0; // partial sums across threads + int pos = 0; // current value index in the data stream + int t0 = 0; // thread 0 for this batch + + int dict_run = 0; + int dict_val = 0; + + while (pos < end_value && ptr <= end) { + if (dict_run <= 1) { + dict_run = (ptr < end) ? get_vlq32(ptr, end) : 0; + if (!(dict_run & 1)) { + // Repeated value + if (ptr + bytecnt <= end) { + int32_t run_val = ptr[0]; + if (bytecnt > 1) { + run_val |= ptr[1] << 8; + if (bytecnt > 2) { + run_val |= ptr[2] << 16; + if (bytecnt > 3) { run_val |= ptr[3] << 24; } + } + } + dict_val = run_val & ((1 << dict_bits) - 1); + } + ptr += bytecnt; + } + } + + int batch_len; + if (dict_run & 1) { + // Literal batch: must output a multiple of 8, except for the last batch + int batch_len_div8; + batch_len = max(min(preprocess_block_size, (int)(dict_run >> 1) * 8), 1); + batch_len_div8 = (batch_len + 7) >> 3; + dict_run -= batch_len_div8 * 2; + ptr += batch_len_div8 * dict_bits; + } else { + batch_len = dict_run >> 1; + dict_run = 0; + } + + int const is_literal = dict_run & 1; + + // calculate my thread id for this batch. 
way to round-robin the work. + int mytid = t - t0; + if (mytid < 0) mytid += preprocess_block_size; + + // compute dictionary index. + if (is_literal) { + int dict_idx = 0; + if (mytid < batch_len) { + dict_idx = dict_val; + int32_t ofs = (mytid - ((batch_len + 7) & ~7)) * dict_bits; + const uint8_t* p = ptr + (ofs >> 3); + ofs &= 7; + if (p < end) { + uint32_t c = 8 - ofs; + dict_idx = (*p++) >> ofs; + if (c < dict_bits && p < end) { + dict_idx |= (*p++) << c; + c += 8; + if (c < dict_bits && p < end) { + dict_idx |= (*p++) << c; + c += 8; + if (c < dict_bits && p < end) { dict_idx |= (*p++) << c; } + } + } + dict_idx &= (1 << dict_bits) - 1; + } + + if (pos + mytid < end_value) { + uint32_t const dict_pos = (dict_bits > 0) ? dict_idx * sizeof(string_index_pair) : 0; + if (pos + mytid >= start_value && dict_pos < (uint32_t)dict_size) { + const auto* src = reinterpret_cast(dict_base + dict_pos); + l_str_len += src->second; + } + } + } + + t0 += batch_len; + } else { + int const start_off = + (pos < start_value && pos + batch_len > start_value) ? start_value - pos : 0; + batch_len = min(batch_len, end_value - pos); + if (mytid == 0) { + uint32_t const dict_pos = (dict_bits > 0) ? dict_val * sizeof(string_index_pair) : 0; + if (pos + batch_len > start_value && dict_pos < (uint32_t)dict_size) { + const auto* src = reinterpret_cast(dict_base + dict_pos); + l_str_len += (batch_len - start_off) * src->second; + } + } + + t0 += 1; + } + + t0 = t0 % preprocess_block_size; + pos += batch_len; + } + __syncthreads(); + + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage reduce_storage; + size_t sum_l = block_reduce(reduce_storage).Sum(l_str_len); + + return sum_l; +} + +/** + * @brief Compute string size information for plain encoded strings. + * + * @param data Pointer to the start of the page data stream + * @param data_size Length of data + * @param start_value Do not count values that occur before this index + * @param end_value Do not count values that occur after this index + */ +__device__ size_t totalPlainEntriesSize(uint8_t const* data, + int data_size, + int start_value, + int end_value) +{ + int const t = threadIdx.x; + int pos = 0; + size_t total_len = 0; + + // This step is purely serial + if (!t) { + const uint8_t* cur = data; + int k = 0; + + while (pos < end_value && k < data_size) { + int len; + if (k + 4 <= data_size) { + len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); + k += 4; + if (k + len > data_size) { len = 0; } + } else { + len = 0; + } + + k += len; + if (pos >= start_value) { total_len += len; } + pos++; + } + } + + return total_len; +} + +/** + * @brief Kernel for computing string page output size information. + * + * String columns need accurate data size information to preallocate memory in the column buffer to + * store the char data. This calls a kernel to calculate information needed by the string decoding + * kernel. On exit, the `str_bytes`, `num_nulls`, and `num_valids` fields of the PageInfo struct + * are updated. This call ignores non-string columns. 
+ * + * @param pages All pages to be decoded + * @param chunks All chunks to be decoded + * @param min_rows crop all rows below min_row + * @param num_rows Maximum number of rows to read + * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels + * @tparam level_t Type used to store decoded repetition and definition levels + */ +template +__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + + // only count if it's a string column + if (not is_string_col(pages[blockIdx.x], chunks)) { return; } + + page_state_s* const s = &state_g; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + PageInfo* const pp = &pages[page_idx]; + + // reset str_bytes to 0 in case it's already been calculated + if (t == 0) { pp->str_bytes = 0; } + + // whether or not we have repetition levels (lists) + bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + + // the level stream decoders + __shared__ rle_run def_runs[run_buffer_size]; + __shared__ rle_run rep_runs[run_buffer_size]; + rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; + + // setup page info + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, string_filter{chunks}, false)) { + return; + } + + if (!t) { + s->page.num_nulls = 0; + s->page.num_valids = 0; + s->page.str_bytes = 0; + } + __syncthreads(); + + bool const is_bounds_pg = is_bounds_page(s, min_row, num_rows, has_repetition); + + // if we're skipping this page anyway, no need to count it + if (!is_bounds_pg && !is_page_contained(s, min_row, num_rows)) { return; } + + // find start/end value indices + auto const [start_value, end_value] = + page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders); + + // need to save num_nulls and num_valids calculated in page_bounds in this page + if (t == 0) { + pp->num_nulls = s->page.num_nulls; + pp->num_valids = s->page.num_valids; + } + + // now process string info in the range [start_value, end_value) + // set up for decoding strings...can be either plain or dictionary + auto const& col = s->col; + uint8_t const* data = s->data_start; + uint8_t const* const end = s->data_end; + uint8_t const* dict_base = nullptr; + int dict_size = 0; + size_t str_bytes = 0; + + switch (pp->encoding) { + case Encoding::PLAIN_DICTIONARY: + case Encoding::RLE_DICTIONARY: + // RLE-packed dictionary indices, first byte indicates index length in bits + if (col.str_dict_index) { + // String dictionary: use index + dict_base = reinterpret_cast(col.str_dict_index); + dict_size = col.page_info[0].num_input_values * sizeof(string_index_pair); + } else { + dict_base = col.page_info[0].page_data; // dictionary is always stored in the first page + dict_size = col.page_info[0].uncompressed_page_size; + } + + // FIXME: need to return an error condition...this won't actually do anything + if (s->dict_bits > 32 || !dict_base) { CUDF_UNREACHABLE("invalid dictionary bit size"); } + + str_bytes = totalDictEntriesSize( + data, dict_base, s->dict_bits, dict_size, (end - data), start_value, end_value); + break; + case Encoding::PLAIN: + dict_size = static_cast(end - data); + str_bytes = is_bounds_pg ? 
totalPlainEntriesSize(data, dict_size, start_value, end_value) + : dict_size - sizeof(int) * (pp->num_input_values - pp->num_nulls); + break; + } + + if (t == 0) { + // TODO check for overflow + pp->str_bytes = str_bytes; + } +} + +/** + * @brief Kernel for computing the string column data stored in the pages + * + * This function will write the page data and the page data's validity to the + * output specified in the page's column chunk. + * + * This version uses a single warp to do the string copies. + * + * @param pages List of pages + * @param chunks List of column chunks + * @param min_row Row index to start reading at + * @param num_rows Maximum number of rows to read + * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels + * @tparam level_t Type used to store decoded repetition and definition levels + */ +template +__global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + __shared__ __align__(16) page_state_buffers_s state_buffers; + __shared__ __align__(4) size_type last_offset; + + page_state_s* const s = &state_g; + page_state_buffers_s* const sb = &state_buffers; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + [[maybe_unused]] null_count_back_copier _{s, t}; + + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, string_filter{chunks}, true)) { + return; + } + + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + + // offsets are local to the page + if (t == 0) { last_offset = 0; } + __syncthreads(); + + int const out_thread0 = s->dict_base && s->dict_bits == 0 ? 32 : 64; + int const leaf_level_index = s->col.max_nesting_depth - 1; + PageNestingDecodeInfo* const nesting_info_base = s->nesting_info; + + __shared__ level_t rep[lvl_buf_size]; // circular buffer of repetition level values + __shared__ level_t def[lvl_buf_size]; // circular buffer of definition level values + + // skipped_leaf_values will always be 0 for flat hierarchies. + uint32_t skipped_leaf_values = s->page.skipped_leaf_values; + while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + int target_pos; + int src_pos = s->src_pos; + + if (t < out_thread0) { + target_pos = min(src_pos + 2 * (decode_block_size - out_thread0), + s->nz_count + (decode_block_size - out_thread0)); + } else { + target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0); + if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); } + } + __syncthreads(); + if (t < 32) { + // decode repetition and definition levels. + // - update validity vectors + // - updates offsets (for nested columns) + // - produces non-NULL value indices in s->nz_idx for subsequent decoding + gpuDecodeLevels(s, sb, target_pos, rep, def, t); + } else if (t < out_thread0) { + // skipped_leaf_values will always be 0 for flat hierarchies. 
+ uint32_t src_target_pos = target_pos + skipped_leaf_values; + + // WARP1: Decode dictionary indices, booleans or string positions + if (s->dict_base) { + src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first; + } else { + gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); + } + if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } + } else { + int const me = t - out_thread0; + + // WARP1..WARP3: Decode values + src_pos += t - out_thread0; + + // the position in the output column/buffer + int dst_pos = sb->nz_idx[rolling_index(src_pos)]; + + // for the flat hierarchy case we will be reading from the beginning of the value stream, + // regardless of the value of first_row. so adjust our destination offset accordingly. + // example: + // - user has passed skip_rows = 2, so our first_row to output is 2 + // - the row values we get from nz_idx will be + // 0, 1, 2, 3, 4 .... + // - by shifting these values by first_row, the sequence becomes + // -1, -2, 0, 1, 2 ... + // - so we will end up ignoring the first two input rows, and input rows 2..n will + // get written to the output starting at position 0. + // + if (!has_repetition) { dst_pos -= s->first_row; } + + // need to do this before we branch on src_pos/dst_pos so we don't deadlock + // choose a character parallel string copy when the average string is longer than a warp + using cudf::detail::warp_size; + auto const use_char_ll = + s->page.num_valids > 0 && (s->page.str_bytes / s->page.num_valids) >= warp_size; + + if (me < warp_size) { + for (int i = 0; i < decode_block_size - out_thread0; i += warp_size) { + dst_pos = sb->nz_idx[rolling_index(src_pos + i)]; + if (!has_repetition) { dst_pos -= s->first_row; } + + auto [ptr, len] = src_pos + i < target_pos && dst_pos >= 0 + ? 
gpuGetStringData(s, sb, src_pos + skipped_leaf_values + i) + : cuda::std::pair{nullptr, 0}; + + __shared__ cub::WarpScan::TempStorage temp_storage; + size_type offset; + cub::WarpScan(temp_storage).ExclusiveSum(len, offset); + offset += last_offset; + + if (use_char_ll) { + __shared__ __align__(8) uint8_t const* pointers[warp_size]; + __shared__ __align__(4) size_type offsets[warp_size]; + __shared__ __align__(4) int dsts[warp_size]; + __shared__ __align__(4) int lengths[warp_size]; + + offsets[me] = offset; + pointers[me] = reinterpret_cast(ptr); + dsts[me] = dst_pos; + lengths[me] = len; + __syncwarp(); + + for (int ss = 0; ss < warp_size && ss + i + s->src_pos < target_pos; ss++) { + if (dsts[ss] >= 0) { + auto offptr = + reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + + dsts[ss]; + *offptr = lengths[ss]; + auto str_ptr = nesting_info_base[leaf_level_index].string_out + offsets[ss]; + ll_strcpy(str_ptr, pointers[ss], lengths[ss], me); + } + } + + } else { + if (src_pos + i < target_pos && dst_pos >= 0) { + auto offptr = + reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dst_pos; + *offptr = len; + auto str_ptr = nesting_info_base[leaf_level_index].string_out + offset; + memcpy(str_ptr, ptr, len); + } + __syncwarp(); + } + + // last thread in warp updates last_offset + if (me == warp_size - 1) { last_offset = offset + len; } + __syncwarp(); + } + } + + if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; } + } + __syncthreads(); + } + + // now turn array of lengths into offsets + int value_count = nesting_info_base[leaf_level_index].value_count; + + // if no repetition we haven't calculated start/end bounds and instead just skipped + // values until we reach first_row. account for that here. + if (!has_repetition) { value_count -= s->first_row; } + + auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); + block_excl_sum(offptr, value_count, s->page.str_offset); +} + +} // anonymous namespace + +/** + * @copydoc cudf::io::parquet::gpu::ComputePageStringSizes + */ +void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t min_row, + size_t num_rows, + int level_type_size, + rmm::cuda_stream_view stream) +{ + dim3 dim_block(preprocess_block_size, 1); + dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + if (level_type_size == 1) { + gpuComputePageStringSizes + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } else { + gpuComputePageStringSizes + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } +} + +/** + * @copydoc cudf::io::parquet::gpu::DecodeStringPageData + */ +void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + + dim3 dim_block(decode_block_size, 1); + dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + + if (level_type_size == 1) { + gpuDecodeStringPageData + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } else { + gpuDecodeStringPageData + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } +} + +} // namespace gpu +} // namespace parquet +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 7b93b527540..25d2885b7da 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ 
b/cpp/src/io/parquet/parquet_gpu.hpp @@ -112,6 +112,7 @@ struct PageNestingDecodeInfo { int32_t valid_count; int32_t value_count; uint8_t* data_out; + uint8_t* string_out; bitmask_type* valid_map; }; @@ -162,9 +163,11 @@ struct PageInfo { // - In the case of a nested schema, you have to decode the repetition and definition // levels to extract actual column values int32_t num_input_values; - int32_t chunk_row; // starting row of this page relative to the start of the chunk - int32_t num_rows; // number of rows in this page - int32_t num_nulls; // number of null values (V2 header) + int32_t chunk_row; // starting row of this page relative to the start of the chunk + int32_t num_rows; // number of rows in this page + // the next two are calculated in gpuComputePageStringSizes + int32_t num_nulls; // number of null values (V2 header), but recalculated for string cols + int32_t num_valids; // number of non-null values, taking into account skip_rows/num_rows int32_t chunk_idx; // column chunk this page belongs to int32_t src_col_schema; // schema index of this column uint8_t flags; // PAGEINFO_FLAGS_XXX @@ -187,6 +190,7 @@ struct PageInfo { // for string columns only, the size of all the chars in the string for // this page. only valid/computed during the base preprocess pass int32_t str_bytes; + int32_t str_offset; // offset into string data for this page // nesting information (input/output) for each page. this array contains // input column nesting information, output column nesting information and @@ -241,6 +245,7 @@ struct ColumnChunkDesc { str_dict_index(nullptr), valid_map_base{nullptr}, column_data_base{nullptr}, + column_string_base{nullptr}, codec(codec_), converted_type(converted_type_), logical_type(logical_type_), @@ -270,6 +275,7 @@ struct ColumnChunkDesc { string_index_pair* str_dict_index; // index for string dictionary bitmask_type** valid_map_base; // base pointers of valid bit map for this column void** column_data_base; // base pointers of column data + void** column_string_base; // base pointers of column string data int8_t codec; // compressed codec enum int8_t converted_type; // converted type enum LogicalType logical_type; // logical type @@ -418,6 +424,15 @@ struct EncPage { compression_result* comp_res; //!< Ptr to compression result }; +/** + * @brief Test if the given column chunk is in a string column + */ +constexpr bool is_string_col(ColumnChunkDesc const& chunk) +{ + return (chunk.data_type & 7) == BYTE_ARRAY and (chunk.data_type >> 3) != 4 and + chunk.converted_type != DECIMAL; +} + /** * @brief Launches kernel for parsing the page headers in the column chunks * @@ -472,6 +487,28 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, int level_type_size, rmm::cuda_stream_view stream); +/** + * @brief Compute string page output size information. + * + * String columns need accurate data size information to preallocate memory in the column buffer to + * store the char data. This calls a kernel to calculate information needed by the string decoding + * kernel. On exit, the `str_bytes`, `num_nulls`, `num_valids`, and `str_offset` fields of the + * PageInfo struct are updated. This call ignores non-string columns. 
+ * + * @param[in,out] pages All pages to be decoded + * @param[in] chunks All chunks to be decoded + * @param[in] min_rows crop all rows below min_row + * @param[in] num_rows Maximum number of rows to read + * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[in] stream CUDA stream to use, default 0 + */ +void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t min_row, + size_t num_rows, + int level_type_size, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for reading the column data stored in the pages * @@ -492,6 +529,26 @@ void DecodePageData(cudf::detail::hostdevice_vector& pages, int level_type_size, rmm::cuda_stream_view stream); +/** + * @brief Launches kernel for reading the string column data stored in the pages + * + * The page data will be written to the output pointed to in the page's + * associated column chunk. + * + * @param[in,out] pages All pages to be decoded + * @param[in] chunks All chunks to be decoded + * @param[in] num_rows Total number of rows to read + * @param[in] min_row Minimum number of rows to read + * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[in] stream CUDA stream to use, default 0 + */ +void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for initializing encoder row group fragments * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 7a6e11537d6..0237bf820b0 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -17,11 +17,30 @@ #include "reader_impl.hpp" #include +#include #include namespace cudf::io::detail::parquet { +namespace { + +int constexpr NUM_DECODERS = 2; // how many decode kernels are there to run +int constexpr APPROX_NUM_THREADS = 4; // guestimate from DaveB +int constexpr STREAM_POOL_SIZE = NUM_DECODERS * APPROX_NUM_THREADS; + +auto& get_stream_pool() +{ + // TODO: creating this on the heap because there were issues with trying to call the + // stream pool destructor during cuda shutdown that lead to a segmentation fault in + // nvbench. this allocation is being deliberately leaked to avoid the above, but still + // results in non-fatal warnings when running nvbench in cuda-gdb. + static auto pool = new rmm::cuda_stream_pool{STREAM_POOL_SIZE}; + return *pool; +} + +} // namespace + void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { auto& chunks = _file_itm_data.chunks; @@ -37,6 +56,29 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); + // Check to see if there are any string columns present. If so, then we need to get size info + // for each string page. This size info will be used to pre-allocate memory for the column, + // allowing the page decoder to write string data directly to the column buffer, rather than + // doing a gather operation later on. + // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, + // chunked reader). 
+ auto const has_strings = std::any_of(chunks.begin(), chunks.end(), gpu::is_string_col); + + std::vector col_sizes(_input_columns.size(), 0L); + if (has_strings) { + gpu::ComputePageStringSizes( + pages, chunks, skip_rows, num_rows, _file_itm_data.level_type_size, _stream); + + col_sizes = calculate_page_string_offsets(); + + // check for overflow + if (std::any_of(col_sizes.begin(), col_sizes.end(), [](size_t sz) { + return sz > std::numeric_limits::max(); + })) { + CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); + } + } + // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector // to store all per-chunk pointers to nested data/nullmask. `chunk_offsets[i]` will store the // offset into `chunk_nested_data`/`chunk_nested_valids` for the array of pointers for chunk `i` @@ -44,6 +86,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) cudf::detail::hostdevice_vector(sum_max_depths, _stream); auto chunk_nested_data = cudf::detail::hostdevice_vector(sum_max_depths, _stream); auto chunk_offsets = std::vector(); + auto chunk_nested_str_data = + cudf::detail::hostdevice_vector(has_strings ? sum_max_depths : 0, _stream); // Update chunks with pointers to column data. for (size_t c = 0, page_count = 0, chunk_off = 0; c < chunks.size(); c++) { @@ -64,6 +108,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) auto data = chunk_nested_data.host_ptr(chunk_off); chunks[c].column_data_base = chunk_nested_data.device_ptr(chunk_off); + auto str_data = has_strings ? chunk_nested_str_data.host_ptr(chunk_off) : nullptr; + chunks[c].column_string_base = + has_strings ? chunk_nested_str_data.device_ptr(chunk_off) : nullptr; + chunk_off += max_depth; // fill in the arrays on the host. 
there are some important considerations to @@ -106,6 +154,11 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (owning_schema == 0 || owning_schema == input_col.schema_idx) { valids[idx] = out_buf.null_mask(); data[idx] = out_buf.data(); + // only do string buffer for leaf + if (out_buf.string_size() == 0 && col_sizes[chunks[c].src_col_index] > 0) { + out_buf.create_string_data(col_sizes[chunks[c].src_col_index], _stream); + } + if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= static_cast(input_col.schema_idx) & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; } else { @@ -121,8 +174,18 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunks.host_to_device_async(_stream); chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); + _stream.synchronize(); - gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); + auto stream1 = get_stream_pool().get_stream(); + gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream1); + if (has_strings) { + auto stream2 = get_stream_pool().get_stream(); + chunk_nested_str_data.host_to_device_async(stream2); + gpu::DecodeStringPageData( + pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream2); + stream2.synchronize(); + } + stream1.synchronize(); pages.device_to_host_async(_stream); page_nesting.device_to_host_async(_stream); @@ -144,21 +207,28 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) auto& out_buf = (*cols)[input_col.nesting[l_idx]]; cols = &out_buf.children; - if (out_buf.type.id() != type_id::LIST || - (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED)) { - continue; + if (out_buf.type.id() == type_id::LIST && + (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED) == 0) { + CUDF_EXPECTS(l_idx < input_col.nesting_depth() - 1, "Encountered a leaf list column"); + auto const& child = (*cols)[input_col.nesting[l_idx + 1]]; + + // the final offset for a list at level N is the size of it's child + int const offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size; + CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), + &offset, + sizeof(offset), + cudaMemcpyDefault, + _stream.value())); + out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; + } else if (out_buf.type.id() == type_id::STRING) { + // need to cap off the string offsets column + size_type const sz = static_cast(col_sizes[idx]); + cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, + &sz, + sizeof(size_type), + cudaMemcpyDefault, + _stream.value()); } - CUDF_EXPECTS(l_idx < input_col.nesting_depth() - 1, "Encountered a leaf list column"); - auto& child = (*cols)[input_col.nesting[l_idx + 1]]; - - // the final offset for a list at level N is the size of it's child - int offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size; - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), - &offset, - sizeof(offset), - cudaMemcpyDefault, - _stream.value())); - out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } } @@ -232,7 +302,7 @@ reader::impl::impl(std::size_t chunk_read_limit, // Don't need to do it if we read the file all at once. 
if (_chunk_read_limit > 0) { for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(column_buffer::empty_like(buff)); + _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); } } } @@ -350,7 +420,7 @@ table_with_metadata reader::impl::read_chunk() if (_chunk_read_limit > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { - _output_buffers.emplace_back(column_buffer::empty_like(buff)); + _output_buffers.emplace_back(inline_column_buffer::empty_like(buff)); } } diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 4d627c41433..d25bf1e4c1f 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -24,8 +24,6 @@ #include "parquet_gpu.hpp" #include "reader_impl_helpers.hpp" -#include - #include #include #include @@ -38,6 +36,7 @@ #include namespace cudf::io::detail::parquet { + /** * @brief Implementation for Parquet reader */ @@ -221,6 +220,13 @@ class reader::impl { */ void allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds); + /** + * @brief Calculate per-page offsets for string data + * + * @return Vector of total string data sizes for each column + */ + std::vector calculate_page_string_offsets(); + /** * @brief Converts the page data and outputs to columns. * @@ -240,10 +246,10 @@ class reader::impl { std::vector _input_columns; // Buffers for generating output columns - std::vector _output_buffers; + std::vector _output_buffers; // Buffers copied from `_output_buffers` after construction for reuse - std::vector _output_buffers_template; + std::vector _output_buffers_template; // _output_buffers associated schema indices std::vector _output_column_schemas; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 76ce7cb68bd..006b8d69aad 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -383,7 +383,9 @@ aggregate_reader_metadata::select_row_groups( return {rows_to_skip, rows_to_read, std::move(selection)}; } -std::tuple, std::vector, std::vector> +std::tuple, + std::vector, + std::vector> aggregate_reader_metadata::select_columns(std::optional> const& use_names, bool include_index, bool strings_to_categorical, @@ -400,17 +402,17 @@ aggregate_reader_metadata::select_columns(std::optional : -1; }; - std::vector output_columns; + std::vector output_columns; std::vector input_columns; std::vector nesting; // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is // valid if "struct1.child1" exists in this file's schema. 
If "struct1" exists but "child1" is // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> + std::function&, bool)> build_column = [&](column_name_info const* col_name_info, int schema_idx, - std::vector& out_col_array, + std::vector& out_col_array, bool has_list_parent) { if (schema_idx < 0) { return false; } auto const& schema_elem = get_schema(schema_idx); @@ -431,7 +433,7 @@ aggregate_reader_metadata::select_columns(std::optional : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); - column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + inline_column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } // store the index of this element if inserted in out_col_array nesting.push_back(static_cast(out_col_array.size())); @@ -471,7 +473,7 @@ aggregate_reader_metadata::select_columns(std::optional to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const element_dtype = to_data_type(element_type, schema_elem); - column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); + inline_column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent || col_type == type_id::LIST) { element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 748f0164244..0192dcd373b 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -188,7 +188,7 @@ class aggregate_reader_metadata { * indices */ [[nodiscard]] std:: - tuple, std::vector, std::vector> + tuple, std::vector, std::vector> select_columns(std::optional> const& use_names, bool include_index, bool strings_to_categorical, diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 695b1252872..cc2758a6e4e 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1259,8 +1259,12 @@ struct get_page_num_rows { __device__ size_type operator()(gpu::PageInfo const& page) { return page.num_rows; } }; -struct get_page_schema { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.src_col_schema; } +struct get_page_column_index { + gpu::ColumnChunkDesc const* chunks; + __device__ size_type operator()(gpu::PageInfo const& page) + { + return chunks[page.chunk_idx].src_col_index; + } }; struct input_col_info { @@ -1483,6 +1487,43 @@ void detect_malformed_pages(cudf::detail::hostdevice_vector& page } } +struct page_to_string_size { + gpu::PageInfo* pages; + gpu::ColumnChunkDesc const* chunks; + + __device__ size_t operator()(size_type page_idx) const + { + auto const page = pages[page_idx]; + auto const chunk = chunks[page.chunk_idx]; + + if (not is_string_col(chunk) || (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) != 0) { + return 0; + } + return pages[page_idx].str_bytes; + } +}; + +struct page_offset_output_iter { + gpu::PageInfo* p; + size_type const* index; + + using value_type = size_type; + using difference_type = size_type; + using pointer = size_type*; + using reference = size_type&; + using iterator_category = thrust::output_device_iterator_tag; + + __host__ __device__ page_offset_output_iter operator+(int i) + { + return 
page_offset_output_iter{p, index + i}; + } + + __host__ __device__ void operator++() { index++; } + + __device__ reference operator[](int i) { return p[index[i]].str_offset; } + __device__ reference operator*() { return p[*index].str_offset; } +}; + } // anonymous namespace void reader::impl::preprocess_pages(size_t skip_rows, @@ -1519,7 +1560,7 @@ void reader::impl::preprocess_pages(size_t skip_rows, pages.device_ptr(), pages.device_ptr() + pages.size(), page_keys.begin(), - get_page_schema{}); + get_page_column_index{chunks.device_ptr()}); thrust::sequence(rmm::exec_policy(_stream), page_index.begin(), page_index.end()); thrust::stable_sort_by_key(rmm::exec_policy(_stream), @@ -1638,16 +1679,16 @@ void reader::impl::preprocess_pages(size_t skip_rows, page_input, chunk_row_output_iter{pages.device_ptr()}); - // preserve page ordering data - _chunk_itm_data.page_keys = std::move(page_keys); - _chunk_itm_data.page_index = std::move(page_index); - // retrieve pages back pages.device_to_host_sync(_stream); // print_pages(pages, _stream); } + // preserve page ordering data for string decoder + _chunk_itm_data.page_keys = std::move(page_keys); + _chunk_itm_data.page_index = std::move(page_index); + // compute splits if necessary. otherwise return a single split representing // the whole file. _chunk_read_info = chunk_read_limit > 0 @@ -1790,4 +1831,44 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses } } +std::vector reader::impl::calculate_page_string_offsets() +{ + auto& chunks = _file_itm_data.chunks; + auto& pages = _file_itm_data.pages_info; + auto const& page_keys = _chunk_itm_data.page_keys; + auto const& page_index = _chunk_itm_data.page_index; + + std::vector col_sizes(_input_columns.size(), 0L); + rmm::device_uvector d_col_sizes(col_sizes.size(), _stream); + + // use page_index to fetch page string sizes in the proper order + auto val_iter = thrust::make_transform_iterator( + page_index.begin(), page_to_string_size{pages.device_ptr(), chunks.device_ptr()}); + + // do scan by key to calculate string offsets for each page + thrust::exclusive_scan_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + val_iter, + page_offset_output_iter{pages.device_ptr(), page_index.data()}); + + // now sum up page sizes + rmm::device_uvector reduce_keys(col_sizes.size(), _stream); + thrust::reduce_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + val_iter, + reduce_keys.begin(), + d_col_sizes.begin()); + + cudaMemcpyAsync(col_sizes.data(), + d_col_sizes.data(), + sizeof(size_t) * col_sizes.size(), + cudaMemcpyDeviceToHost, + _stream); + _stream.synchronize(); + + return col_sizes; +} + } // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 18b17bc8611..9b8754d6318 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -26,43 +26,66 @@ #include -namespace cudf { -namespace io { -namespace detail { +namespace cudf::io::detail { -void column_buffer::create(size_type _size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +void gather_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) { - size = _size; + CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); + // The contents of _strings will never be directly returned to the user. 
+ // Due to the fact that make_strings_column copies the input data to + // produce its outputs, _strings is actually a temporary. As a result, we + // do not pass the provided mr to the call to + // make_zeroed_device_uvector_async here and instead let it use the + // default rmm memory resource. + _strings = std::make_unique>( + cudf::detail::make_zeroed_device_uvector_async( + size, stream, rmm::mr::get_current_device_resource())); +} - switch (type.id()) { - case type_id::STRING: - // The contents of _strings will never be directly returned to the user. - // Due to the fact that make_strings_column copies the input data to - // produce its outputs, _strings is actually a temporary. As a result, we - // do not pass the provided mr to the call to - // make_zeroed_device_uvector_async here and instead let it use the - // default rmm memory resource. - _strings = std::make_unique>( - cudf::detail::make_zeroed_device_uvector_async( - size, stream, rmm::mr::get_current_device_resource())); - break; +std::unique_ptr gather_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) +{ + // make_strings_column allocates new memory, it does not simply move + // from the inputs, so we need to pass it the memory resource given to + // the buffer on construction so that the memory is allocated using the + // resource that the calling code expected. + return make_strings_column(*_strings, stream, _mr); +} - // list columns store a buffer of int32's as offsets to represent - // their individual rows - case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, mr); break; +void inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); + // size + 1 for final offset. _string_data will be initialized later. + _data = create_data(data_type{type_id::INT32}, size + 1, stream, _mr); +} - // struct columns store no data themselves. just validity and children. - case type_id::STRUCT: break; +void inline_column_buffer::create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) +{ + _string_data = rmm::device_buffer(num_bytes, stream, _mr); +} - default: _data = create_data(type, size, stream, mr); break; - } - if (is_nullable) { - _null_mask = - cudf::detail::create_null_mask(size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); - } - this->mr = mr; +std::unique_ptr inline_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) +{ + // no need for copies, just transfer ownership of the data_buffers to the columns + auto const state = mask_state::UNALLOCATED; + auto str_col = + _string_data.size() == 0 + ? 
make_empty_column(data_type{type_id::INT8}) + : std::make_unique(data_type{type_id::INT8}, + string_size(), + std::move(_string_data), + cudf::detail::create_null_mask(size, state, stream, _mr), + state_null_count(state, size), + std::vector>{}); + auto offsets_col = + std::make_unique(data_type{type_to_id()}, + size + 1, + std::move(_data), + cudf::detail::create_null_mask(size + 1, state, stream, _mr), + state_null_count(state, size + 1), + std::vector>{}); + + return make_strings_column( + size, std::move(offsets_col), std::move(str_col), null_count(), std::move(_null_mask)); } namespace { @@ -73,26 +96,55 @@ namespace { * @param buff The old output buffer * @param new_buff The new output buffer */ -void copy_buffer_data(column_buffer const& buff, column_buffer& new_buff) +template +void copy_buffer_data(string_policy const& buff, string_policy& new_buff) { new_buff.name = buff.name; new_buff.user_data = buff.user_data; for (auto const& child : buff.children) { - auto& new_child = new_buff.children.emplace_back(column_buffer(child.type, child.is_nullable)); + auto& new_child = new_buff.children.emplace_back(string_policy(child.type, child.is_nullable)); copy_buffer_data(child, new_child); } } } // namespace -column_buffer column_buffer::empty_like(column_buffer const& input) +template +void column_buffer_base::create(size_type _size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + size = _size; + _mr = mr; + + switch (type.id()) { + case type_id::STRING: static_cast(this)->allocate_strings_data(stream); break; + + // list columns store a buffer of int32's as offsets to represent + // their individual rows + case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, _mr); break; + + // struct columns store no data themselves. just validity and children. + case type_id::STRUCT: break; + + default: _data = create_data(type, size, stream, _mr); break; + } + if (is_nullable) { + _null_mask = cudf::detail::create_null_mask( + size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), _mr); + } +} + +template +string_policy column_buffer_base::empty_like(string_policy const& input) { - auto new_buff = column_buffer(input.type, input.is_nullable); + auto new_buff = string_policy(input.type, input.is_nullable); copy_buffer_data(input, new_buff); return new_buff; } -std::unique_ptr make_column(column_buffer& buffer, +template +std::unique_ptr make_column(column_buffer_base& buffer, column_name_info* schema_info, std::optional const& schema, rmm::cuda_stream_view stream) @@ -111,10 +163,10 @@ std::unique_ptr make_column(column_buffer& buffer, // from the inputs, so we need to pass it the memory resource given to // the buffer on construction so that the memory is allocated using the // resource that the calling code expected. 
- return make_strings_column(*buffer._strings, stream, buffer.mr); + return buffer.make_string_column(stream); } else { // convert to binary - auto const string_col = make_strings_column(*buffer._strings, stream, buffer.mr); + auto const string_col = buffer.make_string_column(stream); auto const num_rows = string_col->size(); auto const null_count = string_col->null_count(); auto col_content = string_col->release(); @@ -160,7 +212,7 @@ std::unique_ptr make_column(column_buffer& buffer, // make child column CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = make_column(buffer.children[0], child_info, child_schema, stream); + auto child = make_column(buffer.children[0], child_info, child_schema, stream); // make the final list column (note : size is the # of offsets, so our actual # of rows is 1 // less) @@ -170,7 +222,7 @@ std::unique_ptr make_column(column_buffer& buffer, buffer._null_count, std::move(buffer._null_mask), stream, - buffer.mr); + buffer._mr); } break; case type_id::STRUCT: { @@ -190,7 +242,7 @@ std::unique_ptr make_column(column_buffer& buffer, : std::nullopt; output_children.emplace_back( - make_column(buffer.children[i], child_info, child_schema, stream)); + make_column(buffer.children[i], child_info, child_schema, stream)); } return make_structs_column(buffer.size, @@ -198,7 +250,7 @@ std::unique_ptr make_column(column_buffer& buffer, buffer._null_count, std::move(buffer._null_mask), stream, - buffer.mr); + buffer._mr); } break; default: { @@ -214,7 +266,8 @@ std::unique_ptr make_column(column_buffer& buffer, /** * @copydoc cudf::io::detail::empty_like */ -std::unique_ptr empty_like(column_buffer& buffer, +template +std::unique_ptr empty_like(column_buffer_base& buffer, column_name_info* schema_info, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -235,7 +288,8 @@ std::unique_ptr empty_like(column_buffer& buffer, // make child column CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = empty_like(buffer.children[0], child_info, stream, mr); + auto child = + cudf::io::detail::empty_like(buffer.children[0], child_info, stream, mr); // make the final list column return make_lists_column( @@ -248,13 +302,14 @@ std::unique_ptr empty_like(column_buffer& buffer, std::transform(buffer.children.begin(), buffer.children.end(), std::back_inserter(output_children), - [&](column_buffer& col) { + [&](auto& col) { column_name_info* child_info = nullptr; if (schema_info != nullptr) { schema_info->children.push_back(column_name_info{""}); child_info = &schema_info->children.back(); } - return empty_like(col, child_info, stream, mr); + return cudf::io::detail::empty_like( + col, child_info, stream, mr); }); return make_structs_column( @@ -265,6 +320,34 @@ std::unique_ptr empty_like(column_buffer& buffer, } } -} // namespace detail -} // namespace io -} // namespace cudf +using pointer_type = gather_column_buffer; +using string_type = inline_column_buffer; + +using pointer_column_buffer = column_buffer_base; +using string_column_buffer = column_buffer_base; + +template std::unique_ptr make_column( + string_column_buffer& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream); + +template std::unique_ptr make_column( + pointer_column_buffer& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream); + +template std::unique_ptr empty_like(string_column_buffer& buffer, + column_name_info* 
schema_info, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +template std::unique_ptr empty_like(pointer_column_buffer& buffer, + column_name_info* schema_info, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +template class column_buffer_base; +template class column_buffer_base; +} // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 1625a56d1c9..2ee7c17e480 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -62,42 +62,62 @@ inline rmm::device_buffer create_data(data_type type, using string_index_pair = thrust::pair; +// forward declare friend functions +template +class column_buffer_base; + /** - * @brief Class for holding device memory buffers to column data that eventually - * will be used to create a column. + * @brief Creates a column from an existing set of device memory buffers. + * + * @throws std::bad_alloc if device memory allocation fails + * + * @param buffer Column buffer descriptors + * @param schema_info Schema information for the column to write optionally. + * @param schema Optional schema used to control string to binary conversions. + * @param stream CUDA stream used for device memory operations and kernel launches. + * + * @return `std::unique_ptr` Column from the existing device data */ -struct column_buffer { - column_buffer() = default; - - // construct without a known size. call create() later to actually - // allocate memory - column_buffer(data_type _type, bool _is_nullable) : type(_type), is_nullable(_is_nullable) {} - - // construct with a known size. allocates memory - column_buffer(data_type _type, - size_type _size, - bool _is_nullable, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : column_buffer(_type, _is_nullable) +template +std::unique_ptr make_column(column_buffer_base& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream); + +template +class column_buffer_base { + public: + column_buffer_base() = default; + + // construct without a known size. call create() later to actually allocate memory + column_buffer_base(data_type _type, bool _is_nullable) : type(_type), is_nullable(_is_nullable) {} + + column_buffer_base(data_type _type, + size_type _size, + bool _is_nullable, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : column_buffer_base(_type, _is_nullable) { - create(_size, stream, mr); } // move constructor - column_buffer(column_buffer&& col) = default; - column_buffer& operator=(column_buffer&& col) = default; + column_buffer_base(column_buffer_base&& col) = default; + column_buffer_base& operator=(column_buffer_base&& col) = default; // copy constructor - column_buffer(column_buffer const& col) = delete; - column_buffer& operator=(column_buffer const& col) = delete; + column_buffer_base(column_buffer_base const& col) = delete; + column_buffer_base& operator=(column_buffer_base const& col) = delete; // instantiate a column of known type with a specified size. Allows deferred creation for // preprocessing steps such as in the Parquet reader void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); - auto data() { return _strings ? _strings->data() : _data.data(); } - auto data_size() const { return _strings ? 
_strings->size() : _data.size(); } + // Create a new column_buffer that has empty data but with the same basic information as the + // input column, including same type, nullability, name, and user_data. + static string_policy empty_like(string_policy const& input); + + void set_null_mask(rmm::device_buffer&& mask) { _null_mask = std::move(mask); } template auto null_mask() @@ -105,43 +125,112 @@ struct column_buffer { return static_cast(_null_mask.data()); } auto null_mask_size() { return _null_mask.size(); } - auto& null_count() { return _null_count; } - // Create a new column_buffer that has empty data but with the same basic information as the - // input column, including same type, nullability, name, and user_data. - static column_buffer empty_like(column_buffer const& input); + auto data() { return static_cast(this)->data_impl(); } + auto data() const { return static_cast(this)->data_impl(); } + auto data_size() const { return static_cast(this)->data_size_impl(); } - std::unique_ptr> _strings; + std::unique_ptr make_string_column(rmm::cuda_stream_view stream) + { + return static_cast(this)->make_string_column_impl(stream); + } + + protected: rmm::device_buffer _data{}; rmm::device_buffer _null_mask{}; size_type _null_count{0}; + rmm::mr::device_memory_resource* _mr; + public: data_type type{type_id::EMPTY}; bool is_nullable{false}; size_type size{0}; - std::vector children; uint32_t user_data{0}; // arbitrary user data std::string name; - rmm::mr::device_memory_resource* mr; + std::vector children; + + friend std::unique_ptr make_column( + column_buffer_base& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream); }; -/** - * @brief Creates a column from an existing set of device memory buffers. - * - * @throws std::bad_alloc if device memory allocation fails - * - * @param buffer Column buffer descriptors - * @param schema_info Schema information for the column to write optionally. - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return `std::unique_ptr` Column from the existing device data - */ -std::unique_ptr make_column(column_buffer& buffer, - column_name_info* schema_info, - std::optional const& schema, - rmm::cuda_stream_view stream); +// column buffer that uses a string_index_pair for strings data, requiring a gather step when +// creating a string column +class gather_column_buffer : public column_buffer_base { + public: + gather_column_buffer() = default; + + // construct without a known size. call create() later to actually allocate memory + gather_column_buffer(data_type _type, bool _is_nullable) + : column_buffer_base(_type, _is_nullable) + { + } + + gather_column_buffer(data_type _type, + size_type _size, + bool _is_nullable, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : column_buffer_base(_type, _size, _is_nullable, stream, mr) + { + create(_size, stream, mr); + } + + void allocate_strings_data(rmm::cuda_stream_view stream); + + void* data_impl() { return _strings ? _strings->data() : _data.data(); } + void const* data_impl() const { return _strings ? _strings->data() : _data.data(); } + size_t data_size_impl() const { return _strings ? 
_strings->size() : _data.size(); } + + std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); + + public: + std::unique_ptr> _strings; +}; + +// column buffer that stores string data internally which can be passed directly when +// creating a string column +class inline_column_buffer : public column_buffer_base { + public: + inline_column_buffer() = default; + + // construct without a known size. call create() later to actually allocate memory + inline_column_buffer(data_type _type, bool _is_nullable) + : column_buffer_base(_type, _is_nullable) + { + } + + inline_column_buffer(data_type _type, + size_type _size, + bool _is_nullable, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : column_buffer_base(_type, _size, _is_nullable, stream, mr) + { + create(_size, stream, mr); + } + + void allocate_strings_data(rmm::cuda_stream_view stream); + + void* data_impl() { return _data.data(); } + void const* data_impl() const { return _data.data(); } + size_t data_size_impl() const { return _data.size(); } + std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); + + void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream); + void* string_data() { return _string_data.data(); } + void const* string_data() const { return _string_data.data(); } + size_t string_size() const { return _string_data.size(); } + + private: + rmm::device_buffer _string_data{}; +}; + +using column_buffer = gather_column_buffer; /** * @brief Creates an equivalent empty column from an existing set of device memory buffers. @@ -158,7 +247,8 @@ std::unique_ptr make_column(column_buffer& buffer, * * @return `std::unique_ptr` Column from the existing device data */ -std::unique_ptr empty_like(column_buffer& buffer, +template +std::unique_ptr empty_like(column_buffer_base& buffer, column_name_info* schema_info, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr);
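
For reference, a minimal usage sketch (not part of the patch) contrasting the two buffer policies declared above. The sizes, stream, and memory resource below are hypothetical placeholders, and in the actual reader the decode kernels populate the offset and character buffers before make_string_column() is called.

// Sketch only: exercises the public interface of gather_column_buffer and
// inline_column_buffer as declared in io/utilities/column_buffer.hpp.
#include "io/utilities/column_buffer.hpp"  // cuDF-internal header

#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

void column_buffer_sketch(rmm::cuda_stream_view stream)
{
  auto* mr            = rmm::mr::get_current_device_resource();
  auto const str_type = cudf::data_type{cudf::type_id::STRING};

  // Gather policy: holds a device vector of string_index_pair (pointer, length);
  // make_string_column() later gathers the characters into a new allocation.
  cudf::io::detail::gather_column_buffer gathered(str_type, 100, true, stream, mr);

  // Inline policy: allocates size + 1 offsets up front; once the total character
  // count is known (e.g. from gpu::ComputePageStringSizes), the character buffer
  // is reserved so the decode kernel can write string data to its final location.
  cudf::io::detail::inline_column_buffer inlined(str_type, 100, true, stream, mr);
  inlined.create_string_data(4096, stream);  // hypothetical byte count

  // Both policies are materialized through the same entry point; for the inline
  // policy this transfers ownership of the offset/character buffers rather than
  // performing a gather.
  auto col = inlined.make_string_column(stream);
  (void)col;
}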