diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 08cc6ec4a14..0ac54113278 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -44,7 +44,7 @@ repos:
         hooks:
               - id: cython-lint
       - repo: https://github.com/pre-commit/mirrors-mypy
-        rev: 'v0.971'
+        rev: 'v1.3.0'
         hooks:
               - id: mypy
                 additional_dependencies: [types-cachetools]
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 5b8a0ea53ba..4031f1aa1c3 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -31,8 +31,8 @@ dependencies:
 - fmt>=9.1.0,<10
 - fsspec>=0.6.0
 - gcc_linux-64=11.*
-- gmock>=1.13.0.*
-- gtest>=1.13.0.*
+- gmock>=1.13.0
+- gtest>=1.13.0
 - hypothesis
 - ipython
 - libarrow==11.0.0.*
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index a236b62c5fd..275b8f9332f 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -396,6 +396,8 @@ outputs:
         - {{ pin_subpackage('libcudf', exact=True) }}
         - {{ pin_subpackage('libcudf_kafka', exact=True) }}
         - cudatoolkit {{ cuda_spec }}
+        - gtest {{ gtest_version }}
+        - gmock {{ gtest_version }}
         - libcurand {{ libcurand_run_version }}
     about:
       home: https://rapids.ai/
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 565a396d913..c3db3370c62 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -153,8 +153,8 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp
 # ##################################################################################################
 # * stream_compaction benchmark -------------------------------------------------------------------
 ConfigureNVBench(
-  STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp
-  stream_compaction/unique_count.cpp
+  STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/distinct_count.cpp
+  stream_compaction/unique.cpp stream_compaction/unique_count.cpp
 )
 
 # ##################################################################################################
@@ -195,8 +195,7 @@ ConfigureBench(
   reduction/reduce.cpp reduction/scan.cpp
 )
 ConfigureNVBench(
-  REDUCTION_NVBENCH reduction/distinct_count.cpp reduction/rank.cpp reduction/scan_structs.cpp
-  reduction/segmented_reduce.cpp
+  REDUCTION_NVBENCH reduction/rank.cpp reduction/scan_structs.cpp reduction/segmented_reduce.cpp
 )
 
 # ##################################################################################################
@@ -280,7 +279,6 @@ ConfigureNVBench(TEXT_NVBENCH text/minhash.cpp)
 # * strings benchmark -------------------------------------------------------------------
 ConfigureBench(
   STRINGS_BENCH
-  string/case.cpp
   string/combine.cpp
   string/contains.cpp
   string/convert_datetime.cpp
@@ -301,7 +299,10 @@ ConfigureBench(
   string/url_decode.cu
 )
 
-ConfigureNVBench(STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp)
+ConfigureNVBench(
+  STRINGS_NVBENCH string/case.cpp string/char_types.cpp string/lengths.cpp string/like.cpp
+  string/reverse.cpp
+)
 
 # ##################################################################################################
 # * json benchmark -------------------------------------------------------------------
diff --git a/cpp/benchmarks/reduction/distinct_count.cpp b/cpp/benchmarks/stream_compaction/distinct_count.cpp
similarity index 75%
rename from cpp/benchmarks/reduction/distinct_count.cpp
rename to cpp/benchmarks/stream_compaction/distinct_count.cpp
index d2218c270a8..2b2c901b90f 100644
--- a/cpp/benchmarks/reduction/distinct_count.cpp
+++ b/cpp/benchmarks/stream_compaction/distinct_count.cpp
@@ -15,14 +15,14 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/rmm_pool_raii.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
 
-#include <cudf/detail/stream_compaction.hpp>
+#include <cudf/stream_compaction.hpp>
 
 #include <nvbench/nvbench.cuh>
 
 template <typename Type>
-static void bench_reduction_distinct_count(nvbench::state& state, nvbench::type_list<Type>)
+static void bench_distinct_count(nvbench::state& state, nvbench::type_list<Type>)
 {
   auto const dtype            = cudf::type_to_id<Type>();
   auto const size             = static_cast<cudf::size_type>(state.get_int64("num_rows"));
@@ -40,16 +40,19 @@ static void bench_reduction_distinct_count(nvbench::state& state, nvbench::type_
   auto const& data_column = data_table->get_column(0);
   auto const input_table  = cudf::table_view{{data_column, data_column, data_column}};
 
+  auto mem_stats_logger = cudf::memory_stats_logger();  // init stats logger
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    rmm::cuda_stream_view stream{launch.get_stream()};
-    cudf::detail::distinct_count(input_table, cudf::null_equality::EQUAL, stream);
+    cudf::distinct_count(input_table, cudf::null_equality::EQUAL);
   });
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
 }
 
 using data_type = nvbench::type_list<int32_t, int64_t, float, double>;
 
-NVBENCH_BENCH_TYPES(bench_reduction_distinct_count, NVBENCH_TYPE_AXES(data_type))
-  .set_name("reduction_distinct_count")
+NVBENCH_BENCH_TYPES(bench_distinct_count, NVBENCH_TYPE_AXES(data_type))
+  .set_name("distinct_count")
   .add_int64_axis("num_rows",
                   {
                     10000,      // 10k
diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp
index 6152ea741a3..0cdd5fbac32 100644
--- a/cpp/benchmarks/string/case.cpp
+++ b/cpp/benchmarks/string/case.cpp
@@ -15,36 +15,64 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
+#include <benchmarks/fixture/rmm_pool_raii.hpp>
 
 #include <cudf/strings/case.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
-class StringCase : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
-static void BM_case(benchmark::State& state)
+void bench_case(nvbench::state& state)
 {
-  cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows});
-  cudf::strings_column_view input(column->view());
+  auto const n_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const max_width = static_cast<int32_t>(state.get_int64("width"));
+  auto const encoding  = state.get_string("encoding");
 
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    cudf::strings::to_lower(input);
+  if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(max_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
   }
 
-  state.SetBytesProcessed(state.iterations() * input.chars_size());
-}
+  data_profile const profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
+
+  auto col_view = column->view();
+
+  cudf::column::contents ascii_contents;
+  if (encoding == "ascii") {
+    data_profile ascii_profile = data_profile_builder().no_validity().distribution(
+      cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126);  // nice ASCII range
+    auto input = cudf::strings_column_view(col_view);
+    auto ascii_column =
+      create_random_column(cudf::type_id::INT8, row_count{input.chars_size()}, ascii_profile);
+    auto ascii_data = ascii_column->view();
 
-#define SORT_BENCHMARK_DEFINE(name)          \
-  BENCHMARK_DEFINE_F(StringCase, name)       \
-  (::benchmark::State & st) { BM_case(st); } \
-  BENCHMARK_REGISTER_F(StringCase, name)     \
-    ->RangeMultiplier(8)                     \
-    ->Ranges({{1 << 12, 1 << 24}})           \
-    ->UseManualTime()                        \
-    ->Unit(benchmark::kMillisecond);
+    col_view = cudf::column_view(col_view.type(),
+                                 col_view.size(),
+                                 nullptr,
+                                 col_view.null_mask(),
+                                 col_view.null_count(),
+                                 0,
+                                 {input.offsets(), ascii_data});
+
+    ascii_contents = ascii_column->release();
+  }
+  auto input = cudf::strings_column_view(col_view);
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+
+  state.add_element_count(input.chars_size(), "chars_size");
+  state.add_global_memory_reads<nvbench::int8_t>(input.chars_size());
+  state.add_global_memory_writes<nvbench::int8_t>(input.chars_size());
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { auto result = cudf::strings::to_lower(input); });
+}
 
-SORT_BENCHMARK_DEFINE(to_lower)
+NVBENCH_BENCH(bench_case)
+  .set_name("strings_case")
+  .add_int64_axis("width", {32, 64, 128, 256, 512, 1024, 2048})
+  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_string_axis("encoding", {"ascii", "utf8"});
diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp
new file mode 100644
index 00000000000..8e9e595fcef
--- /dev/null
+++ b/cpp/benchmarks/string/char_types.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/strings/char_types/char_types.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+static void bench_char_types(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const api_type  = state.get_string("api");
+
+  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+  }
+
+  data_profile const table_profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  auto const table =
+    create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
+  cudf::strings_column_view input(table->view().column(0));
+  auto input_types = cudf::strings::string_character_types::SPACE;
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  // gather some throughput statistics as well
+  auto chars_size = input.chars_size();
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);   // all bytes are read;
+  if (api_type == "all") {
+    state.add_global_memory_writes<nvbench::int8_t>(num_rows);  // output is a bool8 per row
+  } else {
+    state.add_global_memory_writes<nvbench::int8_t>(chars_size);
+  }
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    if (api_type == "all") {
+      auto result = cudf::strings::all_characters_of_type(input, input_types);
+    } else {
+      auto result = cudf::strings::filter_characters_of_type(input, input_types);
+    }
+  });
+}
+
+NVBENCH_BENCH(bench_char_types)
+  .set_name("char_types")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096})
+  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_string_axis("api", {"all", "filter"});
diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json
index 7be868081b6..d5df222ae37 100644
--- a/cpp/cmake/thirdparty/patches/nvbench_override.json
+++ b/cpp/cmake/thirdparty/patches/nvbench_override.json
@@ -12,6 +12,11 @@
           "file" : "nvbench/use_existing_fmt.diff",
           "issue" : "Fix add support for using an existing fmt [https://github.com/NVIDIA/nvbench/pull/125]",
           "fixed_in" : ""
+        },
+        {
+          "file" : "nvbench/public_fmt_dep_in_conda.diff",
+          "issue" : "Propagate fmt requirement in conda envs [https://github.com/NVIDIA/nvbench/pull/127]",
+          "fixed_in" : ""
         }
       ]
     }
diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp
index 921ef5f65f1..cdb8a1c8a09 100644
--- a/cpp/include/cudf/copying.hpp
+++ b/cpp/include/cudf/copying.hpp
@@ -228,6 +228,9 @@ std::unique_ptr<column> empty_like(scalar const& input);
  *
  * Supports only fixed-width types.
  *
+ * If the `mask_alloc` allocates a validity mask that mask is also uninitialized
+ * and the validity bits and the null count should be set by the caller.
+ *
  * @param[in] input Immutable view of input column to emulate
  * @param[in] mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN
  * @param[in] mr Device memory resource used to allocate the returned column's device memory
@@ -244,6 +247,9 @@ std::unique_ptr<column> allocate_like(
  *
  * Supports only fixed-width types.
  *
+ * If the `mask_alloc` allocates a validity mask that mask is also uninitialized
+ * and the validity bits and the null count should be set by the caller.
+ *
  * @param[in] input Immutable view of input column to emulate
  * @param[in] size The desired number of elements that the new column should have capacity for
  * @param[in] mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN
diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp
index 4c6d19739cf..d9a35470b7d 100644
--- a/cpp/include/cudf/detail/contiguous_split.hpp
+++ b/cpp/include/cudf/detail/contiguous_split.hpp
@@ -67,7 +67,7 @@ class metadata_builder {
    * @brief Destructor that will be implemented as default, required because metadata_builder_impl
    * is incomplete at this stage.
    */
-  ~metadata_builder() = default;
+  ~metadata_builder();
 
   /**
    * @brief Add a column to this metadata builder.
@@ -105,9 +105,23 @@ class metadata_builder {
    */
   std::vector<uint8_t> build() const;
 
+  /**
+   * @brief Clear the internal buffer containing all added metadata.
+   */
+  void clear();
+
  private:
   std::unique_ptr<metadata_builder_impl> impl;
 };
 
+/**
+ * @copydoc pack_metadata
+ * @param builder The reusable builder object to create packed column metadata.
+ */
+std::vector<uint8_t> pack_metadata(table_view const& table,
+                                   uint8_t const* contiguous_buffer,
+                                   size_t buffer_size,
+                                   metadata_builder& builder);
+
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp
index 360006c1eea..e8bc97e95b3 100644
--- a/cpp/include/cudf/null_mask.hpp
+++ b/cpp/include/cudf/null_mask.hpp
@@ -36,6 +36,8 @@ namespace cudf {
  * @brief Returns the null count for a null mask of the specified `state`
  * representing `size` elements.
  *
+ * @throw std::invalid_argument if state is UNINITIALIZED
+ *
  * @param state The state of the null mask
  * @param size The number of elements represented by the mask
  * @return The count of null elements
diff --git a/cpp/include/cudf_test/iterator_utilities.hpp b/cpp/include/cudf_test/iterator_utilities.hpp
index c2c6b3ae83d..10f6e77d889 100644
--- a/cpp/include/cudf_test/iterator_utilities.hpp
+++ b/cpp/include/cudf_test/iterator_utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -121,6 +121,9 @@ template <typename Iter>
  * The returned iterator yields `false` (to mark `null`) at the indices corresponding to the
  * pointers having `nullptr` values and `true` for the remaining indices.
  *
+ * @note The input vector is referenced by the transform iterator, so the
+ * lifespan must be just as long as the iterator.
+ *
  * @tparam T the data type
  * @param ptrs The data pointers for which the validity iterator is computed
  * @return auto Validity iterator
@@ -128,8 +131,7 @@ template <typename Iter>
 template <class T>
 [[maybe_unused]] static auto nulls_from_nullptrs(std::vector<T const*> const& ptrs)
 {
-  // The vector `indices` is copied into the lambda as it can be destroyed at the caller site.
-  return thrust::make_transform_iterator(ptrs.begin(), [ptrs](auto ptr) { return ptr != nullptr; });
+  return thrust::make_transform_iterator(ptrs.begin(), [](auto ptr) { return ptr != nullptr; });
 }
 
 }  // namespace iterators
diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp
index 6ea4311c8cb..2404cf0d134 100644
--- a/cpp/include/cudf_test/type_lists.hpp
+++ b/cpp/include/cudf_test/type_lists.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -84,11 +84,13 @@ std::enable_if_t<cudf::is_fixed_width<TypeParam>() && !cudf::is_timestamp_t<Type
                  thrust::host_vector<TypeParam>>
 make_type_param_vector(std::initializer_list<T> const& init_list)
 {
-  thrust::host_vector<TypeParam> vec(init_list.size());
-  std::transform(std::cbegin(init_list), std::cend(init_list), std::begin(vec), [](auto const& e) {
-    if constexpr (std::is_unsigned_v<TypeParam>) { return static_cast<TypeParam>(std::abs(e)); }
-    return static_cast<TypeParam>(e);
-  });
+  std::vector<T> input{init_list};
+  std::vector<TypeParam> vec(init_list.size());
+  std::transform(
+    std::cbegin(input), std::cend(input), std::begin(vec), [](auto const& e) -> TypeParam {
+      if constexpr (std::is_unsigned_v<TypeParam>) { return static_cast<TypeParam>(std::abs(e)); }
+      return static_cast<TypeParam>(e);
+    });
   return vec;
 }
 
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index daec9b5b199..8482769b569 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -217,6 +217,8 @@ std::unique_ptr<column> binary_operation(LhsType const& lhs,
 
   auto out_view = out->mutable_view();
   cudf::binops::compiled::binary_operation(out_view, lhs, rhs, op, stream);
+  // TODO: consider having the binary_operation count nulls instead
+  out->set_null_count(cudf::detail::null_count(out_view.null_mask(), 0, out->size(), stream));
   return out;
 }
 }  // namespace compiled
@@ -373,6 +375,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
 
   auto out_view = out->mutable_view();
   binops::jit::binary_operation(out_view, lhs, rhs, ptx, stream);
+  out->set_null_count(cudf::detail::null_count(out_view.null_mask(), 0, out->size(), stream));
   return out;
 }
 }  // namespace detail
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index b98a2196748..4c22988900b 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -50,10 +50,9 @@ size_type state_null_count(mask_state state, size_type size)
 {
   switch (state) {
     case mask_state::UNALLOCATED: return 0;
-    case mask_state::UNINITIALIZED: return UNKNOWN_NULL_COUNT;
     case mask_state::ALL_NULL: return size;
     case mask_state::ALL_VALID: return 0;
-    default: CUDF_FAIL("Invalid null mask state.");
+    default: CUDF_FAIL("Invalid null mask state.", std::invalid_argument);
   }
 }
 
diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp
index 5f455e26e52..e147b12ad99 100644
--- a/cpp/src/column/column_factories.cpp
+++ b/cpp/src/column/column_factories.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -81,12 +81,13 @@ std::unique_ptr<column> make_numeric_column(data_type type,
   CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type.");
   CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");
 
-  return std::make_unique<column>(type,
-                                  size,
-                                  rmm::device_buffer{size * cudf::size_of(type), stream, mr},
-                                  detail::create_null_mask(size, state, stream, mr),
-                                  state_null_count(state, size),
-                                  std::vector<std::unique_ptr<column>>{});
+  return std::make_unique<column>(
+    type,
+    size,
+    rmm::device_buffer{size * cudf::size_of(type), stream, mr},
+    detail::create_null_mask(size, state, stream, mr),
+    state == mask_state::UNINITIALIZED ? 0 : state_null_count(state, size),
+    std::vector<std::unique_ptr<column>>{});
 }
 
 // Allocate storage for a specified number of numeric elements
@@ -100,12 +101,13 @@ std::unique_ptr<column> make_fixed_point_column(data_type type,
   CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type.");
   CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");
 
-  return std::make_unique<column>(type,
-                                  size,
-                                  rmm::device_buffer{size * cudf::size_of(type), stream, mr},
-                                  detail::create_null_mask(size, state, stream, mr),
-                                  state_null_count(state, size),
-                                  std::vector<std::unique_ptr<column>>{});
+  return std::make_unique<column>(
+    type,
+    size,
+    rmm::device_buffer{size * cudf::size_of(type), stream, mr},
+    detail::create_null_mask(size, state, stream, mr),
+    state == mask_state::UNINITIALIZED ? 0 : state_null_count(state, size),
+    std::vector<std::unique_ptr<column>>{});
 }
 
 // Allocate storage for a specified number of timestamp elements
@@ -119,12 +121,13 @@ std::unique_ptr<column> make_timestamp_column(data_type type,
   CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.");
   CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");
 
-  return std::make_unique<column>(type,
-                                  size,
-                                  rmm::device_buffer{size * cudf::size_of(type), stream, mr},
-                                  detail::create_null_mask(size, state, stream, mr),
-                                  state_null_count(state, size),
-                                  std::vector<std::unique_ptr<column>>{});
+  return std::make_unique<column>(
+    type,
+    size,
+    rmm::device_buffer{size * cudf::size_of(type), stream, mr},
+    detail::create_null_mask(size, state, stream, mr),
+    state == mask_state::UNINITIALIZED ? 0 : state_null_count(state, size),
+    std::vector<std::unique_ptr<column>>{});
 }
 
 // Allocate storage for a specified number of duration elements
@@ -138,12 +141,13 @@ std::unique_ptr<column> make_duration_column(data_type type,
   CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.");
   CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");
 
-  return std::make_unique<column>(type,
-                                  size,
-                                  rmm::device_buffer{size * cudf::size_of(type), stream, mr},
-                                  detail::create_null_mask(size, state, stream, mr),
-                                  state_null_count(state, size),
-                                  std::vector<std::unique_ptr<column>>{});
+  return std::make_unique<column>(
+    type,
+    size,
+    rmm::device_buffer{size * cudf::size_of(type), stream, mr},
+    detail::create_null_mask(size, state, stream, mr),
+    state == mask_state::UNINITIALIZED ? 0 : state_null_count(state, size),
+    std::vector<std::unique_ptr<column>>{});
 }
 
 // Allocate storage for a specified number of fixed width elements
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 4c3b4eddb8d..e7ac424001c 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -1251,6 +1251,8 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   std::vector<column_view> cols;
   cols.reserve(num_root_columns);
   auto cur_dst_buf_info = h_dst_buf_info;
+  cudf::detail::metadata_builder meta_builder(num_root_columns);
+
   for (std::size_t idx = 0; idx < num_partitions; idx++) {
     // traverse the buffers and build the columns.
     cur_dst_buf_info = build_output_columns(
@@ -1258,14 +1260,18 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
 
     // pack the columns
     cudf::table_view t{cols};
-    result.push_back(packed_table{
-      t,
-      packed_columns{
-        std::make_unique<std::vector<uint8_t>>(cudf::pack_metadata(
-          t, reinterpret_cast<uint8_t const*>(out_buffers[idx].data()), out_buffers[idx].size())),
-        std::make_unique<rmm::device_buffer>(std::move(out_buffers[idx]))}});
-
     cols.clear();
+
+    cudf::packed_columns packed_cols{
+      std::make_unique<std::vector<uint8_t>>(
+        cudf::detail::pack_metadata(t,
+                                    reinterpret_cast<uint8_t const*>(out_buffers[idx].data()),
+                                    out_buffers[idx].size(),
+                                    meta_builder)),
+      std::make_unique<rmm::device_buffer>(std::move(out_buffers[idx]))};
+    meta_builder.clear();
+
+    result.emplace_back(packed_table{std::move(t), std::move(packed_cols)});
   }
   return result;
 }
diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp
index 2bcfeae20c4..9d4b02ffb4f 100644
--- a/cpp/src/copying/copy.cpp
+++ b/cpp/src/copying/copy.cpp
@@ -129,7 +129,7 @@ std::unique_ptr<column> allocate_like(column_view const& input,
                                   size,
                                   rmm::device_buffer(size * size_of(input.type()), stream, mr),
                                   detail::create_null_mask(size, allocate_mask, stream, mr),
-                                  state_null_count(allocate_mask, input.size()));
+                                  0);
 }
 
 }  // namespace detail
diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu
index 9a506c866bc..64599716765 100644
--- a/cpp/src/copying/copy_range.cu
+++ b/cpp/src/copying/copy_range.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -108,6 +108,7 @@ struct out_of_place_copy_range_dispatch {
     if (source_end != source_begin) {  // otherwise no-op
       auto ret_view = p_ret->mutable_view();
       in_place_copy_range<T>(source, ret_view, source_begin, source_end, target_begin, stream);
+      p_ret->set_null_count(ret_view.null_count());
     }
 
     return p_ret;
diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp
index bac9aac1886..e4de4a43b68 100644
--- a/cpp/src/copying/pack.cpp
+++ b/cpp/src/copying/pack.cpp
@@ -35,6 +35,8 @@ namespace {
  * and unpack.
  */
 struct serialized_column {
+  serialized_column() = default;
+
   serialized_column(data_type _type,
                     size_type _size,
                     size_type _null_count,
@@ -150,24 +152,22 @@ packed_columns pack(cudf::table_view const& input,
   return contig_split_result.empty() ? packed_columns{} : std::move(contig_split_result[0].data);
 }
 
-template <typename ColumnIter>
-std::vector<uint8_t> pack_metadata(ColumnIter begin,
-                                   ColumnIter end,
+std::vector<uint8_t> pack_metadata(table_view const& table,
                                    uint8_t const* contiguous_buffer,
-                                   size_t buffer_size)
+                                   size_t buffer_size,
+                                   metadata_builder& builder)
 {
-  auto mb = metadata_builder(std::distance(begin, end));
-
-  std::for_each(begin, end, [&mb, &contiguous_buffer, &buffer_size](column_view const& col) {
-    build_column_metadata(mb, col, contiguous_buffer, buffer_size);
-  });
+  std::for_each(
+    table.begin(), table.end(), [&builder, contiguous_buffer, buffer_size](column_view const& col) {
+      build_column_metadata(builder, col, contiguous_buffer, buffer_size);
+    });
 
-  return mb.build();
+  return builder.build();
 }
 
 class metadata_builder_impl {
  public:
-  metadata_builder_impl() = default;
+  metadata_builder_impl(size_type const num_root_columns) { metadata.reserve(num_root_columns); }
 
   void add_column_info_to_meta(data_type const col_type,
                                size_type const col_size,
@@ -182,14 +182,16 @@ class metadata_builder_impl {
 
   std::vector<uint8_t> build() const
   {
-    // convert to anonymous bytes
-    std::vector<uint8_t> metadata_bytes;
-    auto const metadata_begin = reinterpret_cast<uint8_t const*>(metadata.data());
-    std::copy(metadata_begin,
-              metadata_begin + (metadata.size() * sizeof(detail::serialized_column)),
-              std::back_inserter(metadata_bytes));
-
-    return metadata_bytes;
+    auto output = std::vector<uint8_t>(metadata.size() * sizeof(detail::serialized_column));
+    std::memcpy(output.data(), metadata.data(), output.size());
+    return output;
+  }
+
+  void clear()
+  {
+    // Clear all, except the first metadata entry storing the number of top level columns that
+    // was added upon object construction.
+    metadata.resize(1);
   }
 
  private:
@@ -228,13 +230,16 @@ table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data)
 }
 
 metadata_builder::metadata_builder(size_type const num_root_columns)
-  : impl(std::make_unique<metadata_builder_impl>())
+  : impl(std::make_unique<metadata_builder_impl>(num_root_columns +
+                                                 1 /*one more extra metadata entry as below*/))
 {
   // first metadata entry is a stub indicating how many total (top level) columns
   // there are
   impl->add_column_info_to_meta(data_type{type_id::EMPTY}, num_root_columns, 0, -1, -1, 0);
 }
 
+metadata_builder::~metadata_builder() = default;
+
 void metadata_builder::add_column_info_to_meta(data_type const col_type,
                                                size_type const col_size,
                                                size_type const col_null_count,
@@ -248,6 +253,8 @@ void metadata_builder::add_column_info_to_meta(data_type const col_type,
 
 std::vector<uint8_t> metadata_builder::build() const { return impl->build(); }
 
+void metadata_builder::clear() { return impl->clear(); }
+
 }  // namespace detail
 
 /**
@@ -267,9 +274,10 @@ std::vector<uint8_t> pack_metadata(table_view const& table,
                                    size_t buffer_size)
 {
   CUDF_FUNC_RANGE();
-  return table.is_empty()
-           ? std::vector<uint8_t>{}
-           : detail::pack_metadata(table.begin(), table.end(), contiguous_buffer, buffer_size);
+  if (table.is_empty()) { return std::vector<uint8_t>{}; }
+
+  auto builder = cudf::detail::metadata_builder(table.num_columns());
+  return detail::pack_metadata(table, contiguous_buffer, buffer_size, builder);
 }
 
 /**
diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu
index a747cc195ae..342392c773e 100644
--- a/cpp/src/filling/fill.cu
+++ b/cpp/src/filling/fill.cu
@@ -122,6 +122,7 @@ struct out_of_place_fill_range_dispatch {
       auto ret_view    = p_ret->mutable_view();
       using DeviceType = cudf::device_storage_type_t<T>;
       in_place_fill<DeviceType>(ret_view, begin, end, value, stream);
+      p_ret->set_null_count(ret_view.null_count());
     }
 
     return p_ret;
diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp
index 6312fbf93a9..3c8524588af 100644
--- a/cpp/src/io/avro/avro.cpp
+++ b/cpp/src/io/avro/avro.cpp
@@ -27,8 +27,9 @@ template <>
 uint64_t container::get_encoded()
 {
   uint64_t val = 0;
-  for (uint64_t len = 0; len < 64; len += 7) {
-    auto const byte = get_raw<uint8_t>();
+  for (auto len = 0; len < 64; len += 7) {
+    // 64-bit int since shift left is upto 64.
+    uint64_t const byte = get_raw<uint8_t>();
     val |= (byte & 0x7f) << len;
     if (byte < 0x80) break;
   }
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 65c93105304..0fdcd33ada4 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -813,10 +813,10 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
 
       // For string columns return ["offsets", "char"] schema
       if (target_type.id() == type_id::STRING) {
-        return {std::move(col), {{"offsets"}, {"chars"}}};
+        return {std::move(col), std::vector<column_name_info>{{"offsets"}, {"chars"}}};
       }
       // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema
-      return {std::move(col), {}};
+      return {std::move(col), std::vector<column_name_info>{}};
     }
     case json_col_t::StructColumn: {
       std::vector<std::unique_ptr<column>> child_columns;
@@ -860,7 +860,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
                                                        data_type{type_id::INT8},
                                                        0,
                                                        rmm::device_buffer{0, stream, mr}),
-                                                     {}}
+                                                     std::vector<column_name_info>{}}
           : device_json_column_to_cudf_column(
               json_col.child_columns.begin()->second,
               d_input,
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index c29bf2f8866..d8ca0411910 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -1678,11 +1678,11 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
 
       // For string columns return ["offsets", "char"] schema
       if (target_type.id() == type_id::STRING) {
-        return {std::move(col), {{"offsets"}, {"chars"}}};
+        return {std::move(col), std::vector<column_name_info>{{"offsets"}, {"chars"}}};
       }
       // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema
       else {
-        return {std::move(col), {}};
+        return {std::move(col), std::vector<column_name_info>{}};
       }
       break;
     }
@@ -1724,7 +1724,8 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
       auto [child_column, names] =
         json_col.child_columns.empty()
           ? std::pair<std::unique_ptr<column>,
-                      std::vector<column_name_info>>{std::make_unique<column>(), {}}
+                      std::vector<column_name_info>>{std::make_unique<column>(),
+                                                     std::vector<column_name_info>{}}
           : json_column_to_cudf_column(json_col.child_columns.begin()->second,
                                        d_input,
                                        options,
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index fbe44eff5ad..1561737da48 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -102,27 +102,18 @@ constexpr type_id to_type_id(const orc::SchemaType& schema,
   return type_id::EMPTY;
 }
 
-constexpr std::pair<gpu::StreamIndexType, uint32_t> get_index_type_and_pos(
-  const orc::StreamKind kind, uint32_t skip_count, bool non_child)
+gpu::StreamIndexType get_stream_index_type(orc::StreamKind kind)
 {
   switch (kind) {
-    case orc::DATA:
-      skip_count += 1;
-      skip_count |= (skip_count & 0xff) << 8;
-      return std::pair(gpu::CI_DATA, skip_count);
+    case orc::DATA: return gpu::CI_DATA;
     case orc::LENGTH:
-    case orc::SECONDARY:
-      skip_count += 1;
-      skip_count |= (skip_count & 0xff) << 16;
-      return std::pair(gpu::CI_DATA2, skip_count);
-    case orc::DICTIONARY_DATA: return std::pair(gpu::CI_DICTIONARY, skip_count);
-    case orc::PRESENT:
-      skip_count += (non_child ? 1 : 0);
-      return std::pair(gpu::CI_PRESENT, skip_count);
-    case orc::ROW_INDEX: return std::pair(gpu::CI_INDEX, skip_count);
+    case orc::SECONDARY: return gpu::CI_DATA2;
+    case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY;
+    case orc::PRESENT: return gpu::CI_PRESENT;
+    case orc::ROW_INDEX: return gpu::CI_INDEX;
     default:
       // Skip this stream as it's not strictly required
-      return std::pair(gpu::CI_NUM_STREAMS, 0);
+      return gpu::CI_NUM_STREAMS;
   }
 }
 
@@ -213,16 +204,15 @@ size_t gather_stream_info(const size_t stripe_index,
     }
     if (col != -1) {
       if (src_offset >= stripeinfo->indexLength || use_index) {
-        // NOTE: skip_count field is temporarily used to track index ordering
-        auto& chunk = chunks[stripe_index][col];
-        const auto idx =
-          get_index_type_and_pos(stream.kind, chunk.skip_count, col == orc2gdf[column_id]);
-        if (idx.first < gpu::CI_NUM_STREAMS) {
-          chunk.strm_id[idx.first]  = stream_info.size();
-          chunk.strm_len[idx.first] = stream.length;
-          chunk.skip_count          = idx.second;
-
-          if (idx.first == gpu::CI_DICTIONARY) {
+        auto& chunk           = chunks[stripe_index][col];
+        auto const index_type = get_stream_index_type(stream.kind);
+        if (index_type < gpu::CI_NUM_STREAMS) {
+          chunk.strm_id[index_type]  = stream_info.size();
+          chunk.strm_len[index_type] = stream.length;
+          // NOTE: skip_count field is temporarily used to track the presence of index streams
+          chunk.skip_count |= 1 << index_type;
+
+          if (index_type == gpu::CI_DICTIONARY) {
             chunk.dictionary_start = *num_dictionary_entries;
             chunk.dict_len         = stripefooter->columns[column_id].dictionarySize;
             *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize;
diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu
index 11813677b95..6c0f0767b73 100644
--- a/cpp/src/io/orc/stripe_init.cu
+++ b/cpp/src/io/orc/stripe_init.cu
@@ -21,6 +21,8 @@
 
 #include <cub/cub.cuh>
 #include <rmm/cuda_stream_view.hpp>
+#include <thrust/copy.h>
+#include <thrust/execution_policy.h>
 
 namespace cudf {
 namespace io {
@@ -226,6 +228,30 @@ enum row_entry_state_e {
   STORE_INDEX2,
 };
 
+/**
+ * @brief Calculates the order of index streams based on the index types present in the column.
+ *
+ * @param index_types_bitmap The bitmap of index types showing which index streams are present
+ *
+ * @return The order of index streams
+ */
+static auto __device__ index_order_from_index_types(uint32_t index_types_bitmap)
+{
+  constexpr std::array full_order = {CI_PRESENT, CI_DATA, CI_DATA2};
+
+  std::array<uint32_t, full_order.size()> partial_order;
+  thrust::copy_if(thrust::seq,
+                  full_order.cbegin(),
+                  full_order.cend(),
+                  partial_order.begin(),
+                  [index_types_bitmap] __device__(auto index_type) {
+                    // Check if the index type is present
+                    return index_types_bitmap & (1 << index_type);
+                  });
+
+  return partial_order;
+}
+
 /**
  * @brief Decode a single row group index entry
  *
@@ -239,11 +265,14 @@ static uint32_t __device__ ProtobufParseRowIndexEntry(rowindex_state_s* s,
                                                       uint8_t const* const end)
 {
   constexpr uint32_t pb_rowindexentry_id = ProtofType::FIXEDLEN + 8;
+  auto const stream_order                = index_order_from_index_types(s->chunk.skip_count);
 
   const uint8_t* cur      = start;
   row_entry_state_e state = NOT_FOUND;
-  uint32_t length = 0, strm_idx_id = s->chunk.skip_count >> 8, idx_id = 1, ci_id = CI_PRESENT,
-           pos_end = 0;
+  uint32_t length         = 0;
+  uint32_t idx_id         = 0;
+  uint32_t pos_end        = 0;
+  uint32_t ci_id          = CI_NUM_STREAMS;
   while (cur < end) {
     uint32_t v = 0;
     for (uint32_t l = 0; l <= 28; l += 7) {
@@ -283,10 +312,8 @@ static uint32_t __device__ ProtobufParseRowIndexEntry(rowindex_state_s* s,
         }
         break;
       case STORE_INDEX0:
-        ci_id = (idx_id == (strm_idx_id & 0xff))          ? CI_DATA
-                : (idx_id == ((strm_idx_id >> 8) & 0xff)) ? CI_DATA2
-                                                          : CI_PRESENT;
-        idx_id++;
+        // Start of a new entry; determine the stream index types
+        ci_id = stream_order[idx_id++];
         if (s->is_compressed) {
           if (ci_id < CI_PRESENT) s->row_index_entry[0][ci_id] = v;
           if (cur >= start + pos_end) return length;
diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
index 02c4532bb79..0997983c95e 100644
--- a/cpp/src/strings/case.cu
+++ b/cpp/src/strings/case.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,9 +17,10 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/copying.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/device_atomics.cuh>
 #include <cudf/strings/case.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
@@ -38,31 +39,33 @@ namespace detail {
 namespace {
 
 /**
- * @brief Per string logic for case conversion functions.
+ * @brief Threshold to decide on using string or warp parallel functions.
  *
+ * If the average byte length of a string in a column exceeds this value then
+ * the warp-parallel function is used to compute the output sizes.
+ * Otherwise, a regular string-parallel function is used.
+ *
+ * This value was found using the strings_lengths benchmark results.
  */
-struct upper_lower_fn {
-  const column_device_view d_column;
-  character_flags_table_type case_flag;  // flag to check with on each character
-  const character_flags_table_type* d_flags;
-  const character_cases_table_type* d_case_table;
-  const special_case_mapping* d_special_case_mapping;
-  int32_t* d_offsets{};
-  char* d_chars{};
+constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64;
 
-  __device__ special_case_mapping get_special_case_mapping(uint32_t code_point)
-  {
-    return d_special_case_mapping[get_special_case_hash_index(code_point)];
-  }
+/**
+ * @brief Utility functions for converting characters to upper or lower case
+ */
+struct convert_char_fn {
+  character_flags_table_type case_flag;
+  character_flags_table_type const* d_flags;
+  character_cases_table_type const* d_case_table;
+  special_case_mapping const* d_special_case_mapping;
 
-  // compute-size / copy the bytes representing the special case mapping for this codepoint
-  __device__ int32_t handle_special_case_bytes(uint32_t code_point,
-                                               char* d_buffer,
-                                               detail::character_flags_table_type flag)
+  // compute size or copy the bytes representing the special case mapping for this codepoint
+  __device__ size_type handle_special_case_bytes(uint32_t code_point,
+                                                 detail::character_flags_table_type flag,
+                                                 char* d_buffer = nullptr) const
   {
-    special_case_mapping m = get_special_case_mapping(code_point);
-    size_type bytes        = 0;
+    special_case_mapping m = d_special_case_mapping[get_special_case_hash_index(code_point)];
 
+    size_type bytes   = 0;
     auto const count  = IS_LOWER(flag) ? m.num_upper_chars : m.num_lower_chars;
     auto const* chars = IS_LOWER(flag) ? m.upper : m.lower;
     for (uint16_t idx = 0; idx < count; idx++) {
@@ -73,76 +76,200 @@ struct upper_lower_fn {
     return bytes;
   }
 
-  __device__ void operator()(size_type idx)
+  // this is called for converting any UTF-8 characters
+  __device__ size_type process_character(char_utf8 chr, char* d_buffer = nullptr) const
+  {
+    auto const code_point = detail::utf8_to_codepoint(chr);
+
+    detail::character_flags_table_type flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0;
+
+    // we apply special mapping in two cases:
+    // - uncased characters with the special mapping flag: always
+    // - cased characters with the special mapping flag: when matching the input case_flag
+    if (IS_SPECIAL(flag) && ((flag & case_flag) || !IS_UPPER_OR_LOWER(flag))) {
+      return handle_special_case_bytes(code_point, case_flag, d_buffer);
+    }
+
+    char_utf8 const new_char =
+      (flag & case_flag) ? detail::codepoint_to_utf8(d_case_table[code_point]) : chr;
+    return (d_buffer) ? detail::from_char_utf8(new_char, d_buffer)
+                      : detail::bytes_in_char_utf8(new_char);
+  }
+
+  // special function for converting ASCII-only characters
+  __device__ char process_ascii(char chr)
   {
-    if (d_column.is_null(idx)) {
+    return (case_flag & d_flags[chr]) ? static_cast<char>(d_case_table[chr]) : chr;
+  }
+};
+
+/**
+ * @brief Per string logic for case conversion functions
+ *
+ * This can be used in calls to make_strings_children.
+ */
+struct upper_lower_fn {
+  convert_char_fn converter;
+  column_device_view d_strings;
+  size_type* d_offsets{};
+  char* d_chars{};
+
+  __device__ void operator()(size_type idx) const
+  {
+    if (d_strings.is_null(idx)) {
       if (!d_chars) d_offsets[idx] = 0;
       return;
     }
-    auto const d_str = d_column.template element<string_view>(idx);
-    int32_t bytes    = 0;
+    auto const d_str = d_strings.element<string_view>(idx);
+    size_type bytes  = 0;
     char* d_buffer   = d_chars ? d_chars + d_offsets[idx] : nullptr;
     for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) {
-      uint32_t code_point = detail::utf8_to_codepoint(*itr);
-
-      detail::character_flags_table_type flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0;
-
-      // we apply special mapping in two cases:
-      // - uncased characters with the special mapping flag, always
-      // - cased characters with the special mapping flag, when matching the input case_flag
-      //
-      if (IS_SPECIAL(flag) && ((flag & case_flag) || !IS_UPPER_OR_LOWER(flag))) {
-        auto const new_bytes = handle_special_case_bytes(code_point, d_buffer, case_flag);
-        bytes += new_bytes;
-        if (d_buffer) d_buffer += new_bytes;
+      auto const size = converter.process_character(*itr, d_buffer);
+      if (d_buffer) {
+        d_buffer += size;
       } else {
-        char_utf8 new_char =
-          (flag & case_flag) ? detail::codepoint_to_utf8(d_case_table[code_point]) : *itr;
-        if (!d_buffer)
-          bytes += detail::bytes_in_char_utf8(new_char);
-        else
-          d_buffer += detail::from_char_utf8(new_char, d_buffer);
+        bytes += size;
       }
     }
-    if (!d_buffer) d_offsets[idx] = bytes;
+    if (!d_buffer) { d_offsets[idx] = bytes; }
+  }
+};
+
+/**
+ * @brief Count output bytes in warp-parallel threads
+ *
+ * This executes as one warp per string and just computes the output sizes.
+ */
+struct count_bytes_fn {
+  convert_char_fn converter;
+  column_device_view d_strings;
+  size_type* d_offsets;
+
+  __device__ void operator()(size_type idx) const
+  {
+    auto const str_idx  = idx / cudf::detail::warp_size;
+    auto const lane_idx = idx % cudf::detail::warp_size;
+
+    // initialize the output for the atomicAdd
+    if (lane_idx == 0) { d_offsets[str_idx] = 0; }
+    __syncwarp();
+
+    if (d_strings.is_null(str_idx)) { return; }
+    auto const d_str   = d_strings.element<string_view>(str_idx);
+    auto const str_ptr = d_str.data();
+
+    size_type size = 0;
+    for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) {
+      auto const chr = str_ptr[i];
+      if (is_utf8_continuation_char(chr)) { continue; }
+      char_utf8 u8 = 0;
+      to_char_utf8(str_ptr + i, u8);
+      size += converter.process_character(u8);
+    }
+    // this is every so slightly faster than using the cub::warp_reduce
+    if (size > 0) atomicAdd(d_offsets + str_idx, size);
   }
 };
 
+/**
+ * @brief Special functor for processing ASCII-only data
+ */
+struct ascii_converter_fn {
+  convert_char_fn converter;
+  __device__ char operator()(char chr) { return converter.process_ascii(chr); }
+};
+
 /**
  * @brief Utility method for converting upper and lower case characters
- * in a strings column.
+ * in a strings column
  *
- * @param strings Strings to convert.
+ * @param input Strings to convert
  * @param case_flag The character type to convert (upper, lower, or both)
- * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings column with characters converted.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column with characters converted
  */
-std::unique_ptr<column> convert_case(strings_column_view const& strings,
+std::unique_ptr<column> convert_case(strings_column_view const& input,
                                      character_flags_table_type case_flag,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
 {
-  if (strings.is_empty()) return make_empty_column(type_id::STRING);
-
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_column       = *strings_column;
-
-  // build functor with lookup tables used for case conversion
-  upper_lower_fn functor{d_column,
-                         case_flag,
-                         get_character_flags_table(),
-                         get_character_cases_table(),
-                         get_special_case_mapping_table()};
-
-  // this utility calls the functor to build the offsets and chars columns
-  auto children = cudf::strings::detail::make_strings_children(functor, strings.size(), stream, mr);
-
-  return make_strings_column(strings.size(),
-                             std::move(children.first),
-                             std::move(children.second),
-                             strings.null_count(),
-                             cudf::detail::copy_bitmask(strings.parent(), stream, mr));
+  if (input.size() == input.null_count()) {
+    return std::make_unique<column>(input.parent(), stream, mr);
+  }
+
+  auto const d_strings = column_device_view::create(input.parent(), stream);
+  auto const d_flags   = get_character_flags_table();
+  auto const d_cases   = get_character_cases_table();
+  auto const d_special = get_special_case_mapping_table();
+
+  convert_char_fn ccfn{case_flag, d_flags, d_cases, d_special};
+  upper_lower_fn converter{ccfn, *d_strings};
+
+  // For smaller strings, use the regular string-parallel algorithm
+  if ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) {
+    auto [offsets, chars] =
+      cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr);
+    return make_strings_column(input.size(),
+                               std::move(offsets),
+                               std::move(chars),
+                               input.null_count(),
+                               cudf::detail::copy_bitmask(input.parent(), stream, mr));
+  }
+
+  // Check if the input contains any multi-byte characters.
+  // This check incurs ~20% performance hit for smaller strings and so we only use it
+  // after the threshold check above. The check makes very little impact for larger strings
+  // but results in a large performance gain when the input contains only single-byte characters.
+  // The count_if is faster than any_of or all_of: https://github.com/NVIDIA/thrust/issues/1016
+  bool const multi_byte_chars =
+    thrust::count_if(
+      rmm::exec_policy(stream), input.chars_begin(), input.chars_end(), [] __device__(auto chr) {
+        return is_utf8_continuation_char(chr);
+      }) > 0;
+  if (!multi_byte_chars) {
+    // optimization for ASCII-only case: copy the input column and inplace replace each character
+    auto result = std::make_unique<column>(input.parent(), stream, mr);
+    auto d_chars =
+      result->mutable_view().child(strings_column_view::chars_column_index).data<char>();
+    auto const chars_size = strings_column_view(result->view()).chars_size();
+    thrust::transform(
+      rmm::exec_policy(stream), d_chars, d_chars + chars_size, d_chars, ascii_converter_fn{ccfn});
+    result->set_null_count(input.null_count());
+    return result;
+  }
+
+  // This will use a warp-parallel algorithm to compute the output sizes for each string
+  // and then uses the normal string parallel functor to build the output.
+  auto offsets = make_numeric_column(
+    data_type{type_to_id<size_type>()}, input.size() + 1, mask_state::UNALLOCATED, stream, mr);
+  auto d_offsets = offsets->mutable_view().data<size_type>();
+
+  // first pass, compute output sizes
+  // note: tried to use segmented-reduce approach instead here and it was consistently slower
+  count_bytes_fn counter{ccfn, *d_strings, d_offsets};
+  auto const count_itr = thrust::make_counting_iterator<size_type>(0);
+  thrust::for_each_n(
+    rmm::exec_policy(stream), count_itr, input.size() * cudf::detail::warp_size, counter);
+
+  // convert sizes to offsets
+  auto const bytes =
+    cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream);
+  CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
+               "Size of output exceeds column size limit",
+               std::overflow_error);
+
+  auto chars = create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
+  // second pass, write output
+  converter.d_offsets = d_offsets;
+  converter.d_chars   = chars->mutable_view().data<char>();
+  thrust::for_each_n(rmm::exec_policy(stream), count_itr, input.size(), converter);
+
+  return make_strings_column(input.size(),
+                             std::move(offsets),
+                             std::move(chars),
+                             input.null_count(),
+                             cudf::detail::copy_bitmask(input.parent(), stream, mr));
 }
 
 }  // namespace
diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu
index a403061ba0e..b87fb80fcc2 100644
--- a/cpp/src/strings/char_types/char_types.cu
+++ b/cpp/src/strings/char_types/char_types.cu
@@ -31,59 +31,84 @@
 #include <rmm/cuda_stream_view.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/logical.h>
 #include <thrust/transform.h>
 
 namespace cudf {
 namespace strings {
 namespace detail {
-//
-std::unique_ptr<column> all_characters_of_type(strings_column_view const& strings,
+namespace {
+
+/**
+ * @brief Returns true for each string where all characters match the given types.
+ *
+ * Only the characters that match to `verify_types` are checked.
+ * Returns false if no characters are checked or one character does not match `types`.
+ * Returns true if at least one character is checked and all checked characters match `types`.
+ */
+struct char_types_fn {
+  column_device_view const d_column;
+  character_flags_table_type const* d_flags;
+  string_character_types const types;
+  string_character_types const verify_types;
+
+  __device__ bool operator()(size_type idx) const
+  {
+    if (d_column.is_null(idx)) { return false; }
+    auto const d_str = d_column.element<string_view>(idx);
+    auto const end   = d_str.data() + d_str.size_bytes();
+
+    bool type_matched     = !d_str.empty();  // require at least one character;
+    size_type check_count = 0;               // count checked characters
+    for (auto itr = d_str.data(); type_matched && (itr < end); ++itr) {
+      uint8_t const chr = static_cast<uint8_t>(*itr);
+      if (is_utf8_continuation_char(chr)) { continue; }
+      auto u8 = static_cast<char_utf8>(chr);  // holds UTF8 value
+      // using max(int8) here since max(char)=255 on ARM systems
+      if (u8 > std::numeric_limits<int8_t>::max()) { to_char_utf8(itr, u8); }
+
+      // lookup flags in table by codepoint
+      auto const code_point = utf8_to_codepoint(u8);
+      auto const flag       = code_point <= 0x00'FFFF ? d_flags[code_point] : 0;
+
+      if ((verify_types & flag) ||                   // should flag be verified;
+          (flag == 0 && verify_types == ALL_TYPES))  // special edge case
+      {
+        type_matched = (types & flag) > 0;
+        ++check_count;
+      }
+    }
+
+    return type_matched && (check_count > 0);
+  }
+};
+}  // namespace
+
+std::unique_ptr<column> all_characters_of_type(strings_column_view const& input,
                                                string_character_types types,
                                                string_character_types verify_types,
                                                rmm::cuda_stream_view stream,
                                                rmm::mr::device_memory_resource* mr)
 {
-  auto strings_count  = strings.size();
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_column       = *strings_column;
+  auto d_strings = column_device_view::create(input.parent(), stream);
 
   // create output column
-  auto results      = make_numeric_column(data_type{type_id::BOOL8},
-                                     strings_count,
-                                     cudf::detail::copy_bitmask(strings.parent(), stream, mr),
-                                     strings.null_count(),
+  auto results = make_numeric_column(data_type{type_id::BOOL8},
+                                     input.size(),
+                                     cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                     input.null_count(),
                                      stream,
                                      mr);
-  auto results_view = results->mutable_view();
-  auto d_results    = results_view.data<bool>();
   // get the static character types table
   auto d_flags = detail::get_character_flags_table();
+
   // set the output values by checking the character types for each string
   thrust::transform(rmm::exec_policy(stream),
                     thrust::make_counting_iterator<size_type>(0),
-                    thrust::make_counting_iterator<size_type>(strings_count),
-                    d_results,
-                    [d_column, d_flags, types, verify_types, d_results] __device__(size_type idx) {
-                      if (d_column.is_null(idx)) return false;
-                      auto d_str            = d_column.element<string_view>(idx);
-                      bool check            = !d_str.empty();  // require at least one character
-                      size_type check_count = 0;
-                      for (auto itr = d_str.begin(); check && (itr != d_str.end()); ++itr) {
-                        auto code_point = detail::utf8_to_codepoint(*itr);
-                        // lookup flags in table by code-point
-                        auto flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0;
-                        if ((verify_types & flag) ||                   // should flag be verified
-                            (flag == 0 && verify_types == ALL_TYPES))  // special edge case
-                        {
-                          check = (types & flag) > 0;
-                          ++check_count;
-                        }
-                      }
-                      return check && (check_count > 0);
-                    });
-  //
-  results->set_null_count(strings.null_count());
+                    thrust::make_counting_iterator<size_type>(input.size()),
+                    results->mutable_view().data<bool>(),
+                    char_types_fn{*d_strings, d_flags, types, verify_types});
+
+  results->set_null_count(input.null_count());
   return results;
 }
 
diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp
index a4bbb213930..4693fc8e342 100644
--- a/cpp/tests/bitmask/bitmask_tests.cpp
+++ b/cpp/tests/bitmask/bitmask_tests.cpp
@@ -37,6 +37,7 @@ TEST_F(BitmaskUtilitiesTest, StateNullCount)
   EXPECT_EQ(0, cudf::state_null_count(cudf::mask_state::UNALLOCATED, 42));
   EXPECT_EQ(42, cudf::state_null_count(cudf::mask_state::ALL_NULL, 42));
   EXPECT_EQ(0, cudf::state_null_count(cudf::mask_state::ALL_VALID, 42));
+  EXPECT_THROW(cudf::state_null_count(cudf::mask_state::UNINITIALIZED, 42), std::invalid_argument);
 }
 
 TEST_F(BitmaskUtilitiesTest, BitmaskAllocationSize)
diff --git a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp
index ea23a1c1069..a196f2ec980 100644
--- a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp
+++ b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp
@@ -75,6 +75,7 @@ TEST_F(PurgeNonEmptyNullsTest, SingleLevelList)
   // Set nullmask, post construction.
   cudf::detail::set_null_mask(
     input->mutable_view().null_mask(), 2, 3, false, cudf::get_default_stream());
+  input->set_null_count(1);
   EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input));
   EXPECT_TRUE(cudf::has_nonempty_nulls(*input));
 
@@ -158,6 +159,7 @@ TEST_F(PurgeNonEmptyNullsTest, TwoLevelList)
   // Set nullmask, post construction.
   cudf::detail::set_null_mask(
     input->mutable_view().null_mask(), 3, 4, false, cudf::get_default_stream());
+  input->set_null_count(1);
   EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input));
   EXPECT_TRUE(cudf::has_nonempty_nulls(*input));
 
@@ -213,6 +215,7 @@ TEST_F(PurgeNonEmptyNullsTest, ThreeLevelList)
   // Set nullmask, post construction.
   cudf::detail::set_null_mask(
     input->mutable_view().null_mask(), 3, 4, false, cudf::get_default_stream());
+  input->set_null_count(1);
   EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input));
   EXPECT_TRUE(cudf::has_nonempty_nulls(*input));
 
@@ -267,6 +270,7 @@ TEST_F(PurgeNonEmptyNullsTest, ListOfStrings)
   // Set nullmask, post construction.
   cudf::detail::set_null_mask(
     input->mutable_view().null_mask(), 2, 3, false, cudf::get_default_stream());
+  input->set_null_count(1);
   EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input));
   EXPECT_TRUE(cudf::has_nonempty_nulls(*input));
 
@@ -332,6 +336,7 @@ TEST_F(PurgeNonEmptyNullsTest, UnsanitizedListOfUnsanitizedStrings)
 
   // Set strings nullmask, post construction.
   cudf::set_null_mask(strings->mutable_view().null_mask(), 7, 8, false);
+  strings->set_null_count(1);
   EXPECT_TRUE(cudf::may_have_nonempty_nulls(*strings));
   EXPECT_TRUE(cudf::has_nonempty_nulls(*strings));
 
@@ -358,6 +363,7 @@ TEST_F(PurgeNonEmptyNullsTest, UnsanitizedListOfUnsanitizedStrings)
   // Set lists nullmask, post construction.
   cudf::detail::set_null_mask(
     lists->mutable_view().null_mask(), 2, 3, false, cudf::get_default_stream());
+  lists->set_null_count(1);
   EXPECT_TRUE(cudf::may_have_nonempty_nulls(*lists));
   EXPECT_TRUE(cudf::has_nonempty_nulls(*lists));
 
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 9bf16ecca6b..cd067cce928 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1568,7 +1568,7 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing)
 
   auto make_validity = [](std::vector<int> const& validity) {
     return cudf::detail::make_counting_transform_iterator(
-      0, [=](auto i) -> bool { return static_cast<bool>(validity[i]); });
+      0, [&](auto i) -> bool { return static_cast<bool>(validity[i]); });
   };
 
   constexpr int int_ignore{};
diff --git a/cpp/tests/scalar/scalar_test.cpp b/cpp/tests/scalar/scalar_test.cpp
index 9c70fc364c7..d2f2b5d6a2e 100644
--- a/cpp/tests/scalar/scalar_test.cpp
+++ b/cpp/tests/scalar/scalar_test.cpp
@@ -33,7 +33,7 @@ TYPED_TEST_SUITE(TypedScalarTestWithoutFixedPoint, cudf::test::FixedWidthTypesWi
 TYPED_TEST(TypedScalarTest, DefaultValidity)
 {
   using Type = cudf::device_storage_type_t<TypeParam>;
-  Type value = cudf::test::make_type_param_scalar<TypeParam>(7);
+  Type value = static_cast<Type>(cudf::test::make_type_param_scalar<TypeParam>(7));
   cudf::scalar_type_t<TypeParam> s(value);
 
   EXPECT_TRUE(s.is_valid());
@@ -71,7 +71,7 @@ TYPED_TEST(TypedScalarTestWithoutFixedPoint, SetNull)
 TYPED_TEST(TypedScalarTest, CopyConstructor)
 {
   using Type = cudf::device_storage_type_t<TypeParam>;
-  Type value = cudf::test::make_type_param_scalar<TypeParam>(8);
+  Type value = static_cast<Type>(cudf::test::make_type_param_scalar<TypeParam>(8));
   cudf::scalar_type_t<TypeParam> s(value);
   auto s2 = s;
 
diff --git a/cpp/tests/strings/case_tests.cpp b/cpp/tests/strings/case_tests.cpp
index 3852930dafe..31637a6ab9a 100644
--- a/cpp/tests/strings/case_tests.cpp
+++ b/cpp/tests/strings/case_tests.cpp
@@ -202,6 +202,64 @@ TEST_F(StringsCaseTest, MultiCharLower)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
+TEST_F(StringsCaseTest, Ascii)
+{
+  // triggering the ascii code path requires some long-ish strings
+  cudf::test::strings_column_wrapper input{
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"};
+  auto view     = cudf::strings_column_view(input);
+  auto expected = cudf::test::strings_column_wrapper{
+    "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
+    "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
+    "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
+    "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"};
+  auto results = cudf::strings::to_lower(view);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  expected = cudf::test::strings_column_wrapper{
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ",
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ",
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ",
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=-"};
+  results = cudf::strings::to_upper(view);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  results = cudf::strings::to_upper(cudf::strings_column_view(cudf::slice(input, {1, 3}).front()));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, cudf::slice(expected, {1, 3}).front());
+}
+
+TEST_F(StringsCaseTest, LongStrings)
+{
+  // average string length >= AVG_CHAR_BYTES_THRESHOLD as defined in case.cu
+  cudf::test::strings_column_wrapper input{
+    "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
+    "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
+    "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
+    "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"};
+  auto view     = cudf::strings_column_view(input);
+  auto expected = cudf::test::strings_column_wrapper{
+    "abcdéfghijklmnopqrstuvwxyzabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
+    "abcdéfghijklmnopqrstuvwxyzabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
+    "abcdéfghijklmnopqrstuvwxyzabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
+    "abcdéfghijklmnopqrstuvwxyzabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"};
+  auto results = cudf::strings::to_lower(view);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  expected = cudf::test::strings_column_wrapper{
+    "ABCDÉFGHIJKLMNOPQRSTUVWXYZABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ",
+    "ABCDÉFGHIJKLMNOPQRSTUVWXYZABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ",
+    "ABCDÉFGHIJKLMNOPQRSTUVWXYZABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ",
+    "ABCDÉFGHIJKLMNOPQRSTUVWXYZABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=-"};
+  results = cudf::strings::to_upper(view);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  results = cudf::strings::to_upper(cudf::strings_column_view(cudf::slice(input, {1, 3}).front()));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, cudf::slice(expected, {1, 3}).front());
+}
+
 TEST_F(StringsCaseTest, EmptyStringsColumn)
 {
   cudf::column_view zero_size_strings_column(
diff --git a/dependencies.yaml b/dependencies.yaml
index 00ec63d31ef..70d7f8c1ec8 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -213,8 +213,8 @@ dependencies:
       - output_types: conda
         packages:
           - fmt>=9.1.0,<10
-          - &gtest gtest>=1.13.0.*
-          - &gmock gmock>=1.13.0.*
+          - &gtest gtest>=1.13.0
+          - &gmock gmock>=1.13.0
           # Hard pin the patch version used during the build. This must be kept
           # in sync with the version pinned in get_arrow.cmake.
           - libarrow==11.0.0.*
diff --git a/java/src/main/java/ai/rapids/cudf/BitVectorHelper.java b/java/src/main/java/ai/rapids/cudf/BitVectorHelper.java
index cdb7e9e4418..fa96e833b90 100644
--- a/java/src/main/java/ai/rapids/cudf/BitVectorHelper.java
+++ b/java/src/main/java/ai/rapids/cudf/BitVectorHelper.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2019, NVIDIA CORPORATION.
+ *  Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -71,7 +71,7 @@ private static void shiftSrcLeftAndWriteToDst(HostMemoryBuffer src, HostMemoryBu
   /**
    * This method returns the length in bytes needed to represent X number of rows
    * e.g. getValidityLengthInBytes(5) => 1 byte
-   * getLengthInBytes(7) => 1 byte
+   * getValidityLengthInBytes(7) => 1 byte
    * getValidityLengthInBytes(14) => 2 bytes
    */
   static long getValidityLengthInBytes(long rows) {
diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java
index 4d43ffcb457..fecb13e1921 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java
@@ -119,7 +119,10 @@ public ColumnVector(DType type, long rows, Optional<Long> nullCount,
     incRefCountInternal(true);
   }
 
-  private static OffHeapState makeOffHeap(DType type, long rows, Optional<Long> nullCount,
+  /**
+   * This method is internal and exposed purely for testing purposes
+   */
+  static OffHeapState makeOffHeap(DType type, long rows, Optional<Long> nullCount,
       DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer,
       DeviceMemoryBuffer offsetBuffer, List<DeviceMemoryBuffer> toClose, long[] childHandles) {
     long viewHandle = initViewHandle(type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(),
@@ -141,7 +144,7 @@ private static OffHeapState makeOffHeap(DType type, long rows, Optional<Long> nu
    * @param offsetBuffer a host buffer required for strings and string categories. The column
    *                    vector takes ownership of the buffer. Do not use the buffer after calling
    *                    this.
-   * @param toClose  List of buffers to track adn close once done, usually in case of children
+   * @param toClose  List of buffers to track and close once done, usually in case of children
    * @param childHandles array of longs for child column view handles.
    */
   public ColumnVector(DType type, long rows, Optional<Long> nullCount,
diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 67ad9166fe0..8b59ea68972 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -43,8 +43,10 @@ public class ColumnView implements AutoCloseable, BinaryOperable {
   protected final ColumnVector.OffHeapState offHeap;
 
   /**
-   * Constructs a Column View given a native view address
+   * Constructs a Column View given a native view address. This asserts that if the ColumnView is
+   * of nested-type it doesn't contain non-empty nulls
    * @param address the view handle
+   * @throws AssertionError if the address points to a nested-type view with non-empty nulls
    */
   ColumnView(long address) {
     this.viewHandle = address;
@@ -52,15 +54,24 @@ public class ColumnView implements AutoCloseable, BinaryOperable {
     this.rows = ColumnView.getNativeRowCount(viewHandle);
     this.nullCount = ColumnView.getNativeNullCount(viewHandle);
     this.offHeap = null;
-    AssertEmptyNulls.assertNullsAreEmpty(this);
+    try {
+      AssertEmptyNulls.assertNullsAreEmpty(this);
+    } catch (AssertionError ae) {
+      // offHeap state is null, so there is nothing to clean in offHeap
+      // delete ColumnView to avoid memory leak
+      deleteColumnView(viewHandle);
+      viewHandle = 0;
+      throw ae;
+    }
   }
 
 
   /**
    * Intended to be called from ColumnVector when it is being constructed. Because state creates a
    * cudf::column_view instance and will close it in all cases, we don't want to have to double
-   * close it.
+   * close it. This asserts that if the offHeapState is of nested-type it doesn't contain non-empty nulls
    * @param state the state this view is based off of.
+   * @throws AssertionError if offHeapState points to a nested-type view with non-empty nulls
    */
   protected ColumnView(ColumnVector.OffHeapState state) {
     offHeap = state;
@@ -68,7 +79,14 @@ protected ColumnView(ColumnVector.OffHeapState state) {
     type = DType.fromNative(ColumnView.getNativeTypeId(viewHandle), ColumnView.getNativeTypeScale(viewHandle));
     rows = ColumnView.getNativeRowCount(viewHandle);
     nullCount = ColumnView.getNativeNullCount(viewHandle);
-    AssertEmptyNulls.assertNullsAreEmpty(this);
+    try {
+      AssertEmptyNulls.assertNullsAreEmpty(this);
+    } catch (AssertionError ae) {
+      // cleanup offHeap
+      offHeap.clean(false);
+      viewHandle = 0;
+      throw ae;
+    }
   }
 
   /**
@@ -649,8 +667,14 @@ public final ColumnVector ifElse(Scalar trueValue, Scalar falseValue) {
   public final ColumnVector[] slice(int... indices) {
     long[] nativeHandles = slice(this.getNativeView(), indices);
     ColumnVector[] columnVectors = new ColumnVector[nativeHandles.length];
-    for (int i = 0; i < nativeHandles.length; i++) {
-      columnVectors[i] = new ColumnVector(nativeHandles[i]);
+    try {
+      for (int i = 0; i < nativeHandles.length; i++) {
+        columnVectors[i] = new ColumnVector(nativeHandles[i]);
+        nativeHandles[i] = 0;
+      }
+    } catch (Throwable t) {
+      cleanupColumnViews(nativeHandles, columnVectors);
+      throw t;
     }
     return columnVectors;
   }
@@ -788,12 +812,31 @@ public final ColumnVector[] split(int... indices) {
   public ColumnView[] splitAsViews(int... indices) {
     long[] nativeHandles = split(this.getNativeView(), indices);
     ColumnView[] columnViews = new ColumnView[nativeHandles.length];
-    for (int i = 0; i < nativeHandles.length; i++) {
-      columnViews[i] = new ColumnView(nativeHandles[i]);
+    try {
+      for (int i = 0; i < nativeHandles.length; i++) {
+        columnViews[i] = new ColumnView(nativeHandles[i]);
+        nativeHandles[i] = 0;
+      }
+    } catch (Throwable t) {
+      cleanupColumnViews(nativeHandles, columnViews);
+      throw t;
     }
     return columnViews;
   }
 
+  static void cleanupColumnViews(long[] nativeHandles, ColumnView[] columnViews) {
+    for (ColumnView columnView: columnViews) {
+      if (columnView != null) {
+        columnView.close();
+      }
+    }
+    for (long nativeHandle: nativeHandles) {
+      if (nativeHandle != 0) {
+        deleteColumnView(nativeHandle);
+      }
+    }
+  }
+
   /**
    * Create a new vector of "normalized" values, where:
    *  1. All representations of NaN (and -NaN) are replaced with the normalized NaN value
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 9abc2dbcd7c..93cb1acfae4 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -87,6 +87,7 @@ public Table(long[] cudfColumns) {
     try {
       for (int i = 0; i < cudfColumns.length; i++) {
         this.columns[i] = new ColumnVector(cudfColumns[i]);
+        cudfColumns[i] = 0;
       }
       long[] views = new long[columns.length];
       for (int i = 0; i < columns.length; i++) {
@@ -95,13 +96,7 @@ public Table(long[] cudfColumns) {
       nativeHandle = createCudfTableView(views);
       this.rows = columns[0].getRowCount();
     } catch (Throwable t) {
-      for (int i = 0; i < cudfColumns.length; i++) {
-        if (this.columns[i] != null) {
-          this.columns[i].close();
-        } else {
-          ColumnVector.deleteCudfColumn(cudfColumns[i]);
-        }
-      }
+      ColumnView.cleanupColumnViews(cudfColumns, this.columns);
       throw t;
     }
   }
@@ -3396,8 +3391,14 @@ public static GatherMap mixedLeftAntiJoinGatherMap(Table leftKeys, Table rightKe
   public ColumnVector[] convertToRows() {
     long[] ptrs = convertToRows(nativeHandle);
     ColumnVector[] ret = new ColumnVector[ptrs.length];
-    for (int i = 0; i < ptrs.length; i++) {
-      ret[i] = new ColumnVector(ptrs[i]);
+    try {
+      for (int i = 0; i < ptrs.length; i++) {
+        ret[i] = new ColumnVector(ptrs[i]);
+        ptrs[i] = 0;
+      }
+    } catch (Throwable t) {
+      ColumnView.cleanupColumnViews(ptrs, ret);
+      throw t;
     }
     return ret;
   }
@@ -3479,8 +3480,14 @@ public ColumnVector[] convertToRows() {
   public ColumnVector[] convertToRowsFixedWidthOptimized() {
     long[] ptrs = convertToRowsFixedWidthOptimized(nativeHandle);
     ColumnVector[] ret = new ColumnVector[ptrs.length];
-    for (int i = 0; i < ptrs.length; i++) {
-      ret[i] = new ColumnVector(ptrs[i]);
+    try {
+      for (int i = 0; i < ptrs.length; i++) {
+        ret[i] = new ColumnVector(ptrs[i]);
+        ptrs[i] = 0;
+      }
+    } catch (Throwable t) {
+      ColumnView.cleanupColumnViews(ptrs, ret);
+      throw t;
     }
     return ret;
   }
@@ -3552,14 +3559,7 @@ public static Table fromPackedTable(ByteBuffer metadata, DeviceMemoryBuffer data
       }
       result = new Table(columns);
     } catch (Throwable t) {
-      for (int i = 0; i < columns.length; i++) {
-        if (columns[i] != null) {
-          columns[i].close();
-        }
-        if (columnViewAddresses[i] != 0) {
-          ColumnView.deleteColumnView(columnViewAddresses[i]);
-        }
-      }
+      ColumnView.cleanupColumnViews(columnViewAddresses, columns);
       throw t;
     }
 
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 6e9498acdac..59b4c9f9f67 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -32,6 +32,7 @@
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
+import java.util.Optional;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.function.Supplier;
 import java.util.stream.Collectors;
@@ -6677,6 +6678,54 @@ void testApplyBooleanMaskFromListOfStructure() {
     }
   }
 
+  @Test
+  void testColumnViewWithNonEmptyNullsIsCleared() {
+    List<Integer> list0 = Arrays.asList(1, 2, 3);
+    List<Integer> list1 = Arrays.asList(4, 5, null);
+    List<Integer> list2 = Arrays.asList(7, 8, 9);
+    List<Integer> list3 = null;
+    try (ColumnVector input = ColumnVectorTest.makeListsColumn(DType.INT32, list0, list1, list2, list3);
+         BaseDeviceMemoryBuffer baseValidityBuffer = input.getDeviceBufferFor(BufferType.VALIDITY);
+         BaseDeviceMemoryBuffer baseOffsetBuffer = input.getDeviceBufferFor(BufferType.OFFSET);
+         HostMemoryBuffer newValidity = HostMemoryBuffer.allocate(BitVectorHelper.getValidityAllocationSizeInBytes(4))) {
+
+      newValidity.copyFromDeviceBuffer(baseValidityBuffer);
+      // we are setting list1 with 3 elements to null. This will result in a non-empty null in the
+      // ColumnView at index 1
+      BitVectorHelper.setNullAt(newValidity, 1);
+      // validityBuffer will be closed by offHeapState later
+      DeviceMemoryBuffer validityBuffer = DeviceMemoryBuffer.allocate(BitVectorHelper.getValidityAllocationSizeInBytes(4));
+      try {
+        // offsetBuffer will be closed by offHeapState later
+        DeviceMemoryBuffer offsetBuffer = DeviceMemoryBuffer.allocate(baseOffsetBuffer.getLength());
+        try {
+          validityBuffer.copyFromHostBuffer(newValidity);
+          offsetBuffer.copyFromMemoryBuffer(0, baseOffsetBuffer, 0,
+              baseOffsetBuffer.length, Cuda.DEFAULT_STREAM);
+
+          // The new offHeapState will have 2 nulls, one null at index 4 from the original ColumnVector
+          // the other at index 1 which is non-empty
+          ColumnVector.OffHeapState offHeapState = ColumnVector.makeOffHeap(input.type, input.rows, Optional.of(2L),
+              null, validityBuffer, offsetBuffer,
+              null, Arrays.stream(input.getChildColumnViews()).mapToLong((c) -> c.viewHandle).toArray());
+          try {
+            new ColumnView(offHeapState);
+          } catch (AssertionError ae) {
+            assert offHeapState.isClean();
+          }
+        } catch (Exception e) {
+          if (!offsetBuffer.closed) {
+            offsetBuffer.close();
+          }
+        }
+      } catch (Exception e) {
+        if (!validityBuffer.closed) {
+          validityBuffer.close();
+        }
+      }
+    }
+  }
+
   @Test
   public void testEventHandlerIsCalledForEachClose() {
     final AtomicInteger onClosedWasCalled = new AtomicInteger(0);
@@ -6700,5 +6749,4 @@ public void testEventHandlerIsNotCalledIfNotSet() {
     }
     assertEquals(0, onClosedWasCalled.get());
   }
-
 }
diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi
index 013cba3ae03..93edef311db 100644
--- a/python/cudf/cudf/_lib/column.pyi
+++ b/python/cudf/cudf/_lib/column.pyi
@@ -4,12 +4,12 @@ from __future__ import annotations
 
 from typing import Dict, Optional, Tuple, TypeVar
 
+from typing_extensions import Self
+
 from cudf._typing import Dtype, DtypeObj, ScalarLike
 from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase
 
-T = TypeVar("T")
-
 class Column:
     _data: Optional[Buffer]
     _mask: Optional[Buffer]
@@ -29,8 +29,8 @@ class Column:
         size: int,
         dtype: Dtype,
         mask: Optional[Buffer] = None,
-        offset: int = None,
-        null_count: int = None,
+        offset: Optional[int] = None,
+        null_count: Optional[int] = None,
         children: Tuple[ColumnBase, ...] = (),
     ) -> None: ...
     @property
@@ -56,7 +56,7 @@ class Column:
     @property
     def mask_ptr(self) -> int: ...
     def set_base_mask(self, value: Optional[Buffer]) -> None: ...
-    def set_mask(self: T, value: Optional[Buffer]) -> T: ...
+    def set_mask(self, value: Optional[Buffer]) -> Self: ...
     @property
     def null_count(self) -> int: ...
     @property
@@ -68,7 +68,8 @@ class Column:
     def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ...
     def _mimic_inplace(
         self, other_col: ColumnBase, inplace=False
-    ) -> Optional[ColumnBase]: ...
+    ) -> Optional[Self]: ...
+
     # TODO: The val parameter should be Scalar, not ScalarLike
     @staticmethod
     def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ...
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 68c52a8b9e8..a2e3bc44f3a 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -5,9 +5,10 @@
 import pickle
 import warnings
 from functools import cached_property
-from typing import Any, Set, TypeVar
+from typing import Any, Set
 
 import pandas as pd
+from typing_extensions import Self
 
 import cudf
 from cudf._lib.copying import _gather_map_is_valid, gather
@@ -62,8 +63,6 @@
 Float64Index([1.0, 2.0, 3.0], dtype='float64')
 """
 
-BaseIndexT = TypeVar("BaseIndexT", bound="BaseIndex")
-
 
 class BaseIndex(Serializable):
     """Base class for all cudf Index types."""
@@ -101,8 +100,8 @@ def __contains__(self, item):
         return item in self._values
 
     def _copy_type_metadata(
-        self: BaseIndexT, other: BaseIndexT, *, override_dtypes=None
-    ) -> BaseIndexT:
+        self, other: Self, *, override_dtypes=None
+    ) -> Self:
         raise NotImplementedError
 
     def get_level_values(self, level):
@@ -1451,7 +1450,6 @@ def get_slice_bound(self, label, side, kind=None):
         raise NotImplementedError
 
     def __array_function__(self, func, types, args, kwargs):
-
         # check if the function is implemented for the current type
         cudf_index_module = type(self)
         for submodule in func.__module__.split(".")[1:]:
diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index 2262730d8a1..97f3b16bec8 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -5,9 +5,10 @@
 import math
 import pickle
 from types import SimpleNamespace
-from typing import Any, Dict, Mapping, Sequence, Tuple, Type, TypeVar
+from typing import Any, Dict, Mapping, Optional, Sequence, Tuple
 
 import numpy
+from typing_extensions import Self
 
 import rmm
 
@@ -15,8 +16,6 @@
 from cudf.core.abc import Serializable
 from cudf.utils.string import format_bytes
 
-T = TypeVar("T", bound="Buffer")
-
 
 def host_memory_allocation(nbytes: int) -> memoryview:
     """Allocate host memory using NumPy
@@ -42,7 +41,7 @@ def host_memory_allocation(nbytes: int) -> memoryview:
 def cuda_array_interface_wrapper(
     ptr: int,
     size: int,
-    owner: object = None,
+    owner: Optional[object] = None,
     readonly=False,
     typestr="|u1",
     version=0,
@@ -108,7 +107,7 @@ def __init__(self):
         )
 
     @classmethod
-    def _from_device_memory(cls: Type[T], data: Any) -> T:
+    def _from_device_memory(cls, data: Any) -> Self:
         """Create a Buffer from an object exposing `__cuda_array_interface__`.
 
         No data is being copied.
@@ -139,7 +138,7 @@ def _from_device_memory(cls: Type[T], data: Any) -> T:
         return ret
 
     @classmethod
-    def _from_host_memory(cls: Type[T], data: Any) -> T:
+    def _from_host_memory(cls, data: Any) -> Self:
         """Create a Buffer from a buffer or array like object
 
         Data must implement `__array_interface__`, the buffer protocol, and/or
@@ -310,7 +309,7 @@ def serialize(self) -> Tuple[dict, list]:
         return header, frames
 
     @classmethod
-    def deserialize(cls: Type[T], header: dict, frames: list) -> T:
+    def deserialize(cls, header: dict, frames: list) -> Self:
         """Create an Buffer from a serialized representation.
 
         Parameters
diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py
index 16a1e3942e7..6243916b91b 100644
--- a/python/cudf/cudf/core/buffer/cow_buffer.py
+++ b/python/cudf/cudf/core/buffer/cow_buffer.py
@@ -4,15 +4,15 @@
 
 import weakref
 from collections import defaultdict
-from typing import Any, DefaultDict, Tuple, Type, TypeVar
+from typing import Any, DefaultDict, Tuple
 from weakref import WeakSet
 
+from typing_extensions import Self
+
 import rmm
 
 from cudf.core.buffer.buffer import Buffer
 
-T = TypeVar("T", bound="CopyOnWriteBuffer")
-
 
 def _keys_cleanup(ptr):
     weak_set_values = CopyOnWriteBuffer._instances[ptr]
@@ -55,9 +55,7 @@ def _finalize_init(self):
         weakref.finalize(self, _keys_cleanup, self._ptr)
 
     @classmethod
-    def _from_device_memory(
-        cls: Type[T], data: Any, *, exposed: bool = False
-    ) -> T:
+    def _from_device_memory(cls, data: Any, *, exposed: bool = False) -> Self:
         """Create a Buffer from an object exposing `__cuda_array_interface__`.
 
         No data is being copied.
@@ -82,7 +80,7 @@ def _from_device_memory(
         return ret
 
     @classmethod
-    def _from_host_memory(cls: Type[T], data: Any) -> T:
+    def _from_host_memory(cls, data: Any) -> Self:
         ret = super()._from_host_memory(data)
         ret._finalize_init()
         return ret
diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py
index d2a87af3869..7f8399ba522 100644
--- a/python/cudf/cudf/core/buffer/spill_manager.py
+++ b/python/cudf/cudf/core/buffer/spill_manager.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -225,7 +225,7 @@ def __init__(
         self,
         *,
         spill_on_demand: bool = False,
-        device_memory_limit: int = None,
+        device_memory_limit: Optional[int] = None,
         statistic_level: int = 0,
     ) -> None:
         self._lock = threading.Lock()
@@ -358,7 +358,7 @@ def spill_device_memory(self, nbytes: int) -> int:
                     buf.lock.release()
         return spilled
 
-    def spill_to_device_limit(self, device_limit: int = None) -> int:
+    def spill_to_device_limit(self, device_limit: Optional[int] = None) -> int:
         """Try to spill device memory until device limit
 
         Notice, by default this is a no-op.
diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py
index c71841a5a26..c1bc49c7a9e 100644
--- a/python/cudf/cudf/core/buffer/spillable_buffer.py
+++ b/python/cudf/cudf/core/buffer/spillable_buffer.py
@@ -7,18 +7,10 @@
 import time
 import weakref
 from threading import RLock
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Dict,
-    List,
-    Optional,
-    Tuple,
-    Type,
-    TypeVar,
-)
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 
 import numpy
+from typing_extensions import Self
 
 import rmm
 
@@ -34,9 +26,6 @@
     from cudf.core.buffer.spill_manager import SpillManager
 
 
-T = TypeVar("T", bound="SpillableBuffer")
-
-
 def get_spillable_owner(data) -> Optional[SpillableBuffer]:
     """Get the spillable owner of `data`, if any exist
 
@@ -212,9 +201,7 @@ def _finalize_init(self, ptr_desc: Dict[str, Any], exposed: bool) -> None:
         self._manager.add(self)
 
     @classmethod
-    def _from_device_memory(
-        cls: Type[T], data: Any, *, exposed: bool = False
-    ) -> T:
+    def _from_device_memory(cls, data: Any, *, exposed: bool = False) -> Self:
         """Create a spillabe buffer from device memory.
 
         No data is being copied.
@@ -236,7 +223,7 @@ def _from_device_memory(
         return ret
 
     @classmethod
-    def _from_host_memory(cls: Type[T], data: Any) -> T:
+    def _from_host_memory(cls, data: Any) -> Self:
         """Create a spillabe buffer from host memory.
 
         Data must implement `__array_interface__`, the buffer protocol, and/or
@@ -448,7 +435,9 @@ def __cuda_array_interface__(self) -> dict:
             "version": 0,
         }
 
-    def memoryview(self, *, offset: int = 0, size: int = None) -> memoryview:
+    def memoryview(
+        self, *, offset: int = 0, size: Optional[int] = None
+    ) -> memoryview:
         size = self._size if size is None else size
         with self.lock:
             if self.spillable:
@@ -573,7 +562,9 @@ def deserialize(cls, header: dict, frames: list):
         # copied.
         return SpillableBuffer.deserialize(header, frames)
 
-    def memoryview(self, *, offset: int = 0, size: int = None) -> memoryview:
+    def memoryview(
+        self, *, offset: int = 0, size: Optional[int] = None
+    ) -> memoryview:
         size = self._size if size is None else size
         return self._base.memoryview(offset=self._offset + offset, size=size)
 
diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py
index 2fe332a12fe..85e4762641e 100644
--- a/python/cudf/cudf/core/buffer/utils.py
+++ b/python/cudf/cudf/core/buffer/utils.py
@@ -16,8 +16,8 @@
 def as_buffer(
     data: Union[int, Any],
     *,
-    size: int = None,
-    owner: object = None,
+    size: Optional[int] = None,
+    owner: Optional[object] = None,
     exposed: bool = False,
 ) -> Buffer:
     """Factory function to wrap `data` in a Buffer object.
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 52f7c0b957f..39332807139 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -11,6 +11,7 @@
 import pandas as pd
 import pyarrow as pa
 from numba import cuda
+from typing_extensions import Self
 
 import cudf
 from cudf import _lib as libcudf
@@ -710,13 +711,12 @@ class CategoricalColumn(column.ColumnBase):
     def __init__(
         self,
         dtype: CategoricalDtype,
-        mask: Buffer = None,
-        size: int = None,
+        mask: Optional[Buffer] = None,
+        size: Optional[int] = None,
         offset: int = 0,
-        null_count: int = None,
+        null_count: Optional[int] = None,
         children: Tuple["column.ColumnBase", ...] = (),
     ):
-
         if size is None:
             for child in children:
                 assert child.offset == 0
@@ -874,7 +874,7 @@ def _fill(
         begin: int,
         end: int,
         inplace: bool = False,
-    ) -> "column.ColumnBase":
+    ) -> Self:
         if end <= begin or begin >= self.size:
             return self if inplace else self.copy()
 
@@ -889,18 +889,21 @@ def _fill(
         return result
 
     def slice(
-        self, start: int, stop: int, stride: int = None
-    ) -> "column.ColumnBase":
+        self, start: int, stop: int, stride: Optional[int] = None
+    ) -> Self:
         codes = self.codes.slice(start, stop, stride)
-        return cudf.core.column.build_categorical_column(
-            categories=self.categories,
-            codes=cudf.core.column.build_column(
-                codes.base_data, dtype=codes.dtype
+        return cast(
+            Self,
+            cudf.core.column.build_categorical_column(
+                categories=self.categories,
+                codes=cudf.core.column.build_column(
+                    codes.base_data, dtype=codes.dtype
+                ),
+                mask=codes.base_mask,
+                ordered=self.ordered,
+                size=codes.size,
+                offset=codes.offset,
             ),
-            mask=codes.base_mask,
-            ordered=self.ordered,
-            size=codes.size,
-            offset=codes.offset,
         )
 
     def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
@@ -962,7 +965,9 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]:
             " if you need this functionality."
         )
 
-    def to_pandas(self, index: pd.Index = None, **kwargs) -> pd.Series:
+    def to_pandas(
+        self, index: Optional[pd.Index] = None, **kwargs
+    ) -> pd.Series:
         if self.categories.dtype.kind == "f":
             new_mask = bools_to_mask(self.notnull())
             col = column.build_categorical_column(
@@ -1219,7 +1224,10 @@ def notnull(self) -> ColumnBase:
         return result
 
     def fillna(
-        self, fill_value: Any = None, method: Any = None, dtype: Dtype = None
+        self,
+        fill_value: Any = None,
+        method: Any = None,
+        dtype: Optional[Dtype] = None,
     ) -> CategoricalColumn:
         """
         Fill null values with *fill_value*
@@ -1237,7 +1245,7 @@ def fillna(
                     try:
                         fill_value = self._encode(fill_value)
                         fill_value = self.codes.dtype.type(fill_value)
-                    except (ValueError) as err:
+                    except ValueError as err:
                         err_msg = "fill value must be in categories"
                         raise ValueError(err_msg) from err
             else:
@@ -1351,7 +1359,7 @@ def _get_decategorized_column(self) -> ColumnBase:
         out = out.set_mask(self.mask)
         return out
 
-    def copy(self, deep: bool = True) -> CategoricalColumn:
+    def copy(self, deep: bool = True) -> Self:
         result_col = super().copy(deep=deep)
         if deep:
             result_col.categories = libcudf.copying.copy_column(
@@ -1365,7 +1373,7 @@ def memory_usage(self) -> int:
 
     def _mimic_inplace(
         self, other_col: ColumnBase, inplace: bool = False
-    ) -> Optional[ColumnBase]:
+    ) -> Optional[Self]:
         out = super()._mimic_inplace(other_col, inplace=inplace)
         if inplace and isinstance(other_col, CategoricalColumn):
             self._codes = other_col._codes
@@ -1641,7 +1649,7 @@ def _create_empty_categorical_column(
 
 
 def pandas_categorical_as_column(
-    categorical: ColumnLike, codes: ColumnLike = None
+    categorical: ColumnLike, codes: Optional[ColumnLike] = None
 ) -> CategoricalColumn:
     """Creates a CategoricalColumn from a pandas.Categorical
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 042a1060fae..607bf83ff6c 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import builtins
 import pickle
 import warnings
 from functools import cached_property
@@ -15,7 +16,6 @@
     Optional,
     Sequence,
     Tuple,
-    TypeVar,
     Union,
     cast,
 )
@@ -25,6 +25,7 @@
 import pandas as pd
 import pyarrow as pa
 from numba import cuda
+from typing_extensions import Self
 
 import rmm
 
@@ -97,11 +98,6 @@
 else:
     from pandas.core.arrays._arrow_utils import ArrowIntervalType
 
-T = TypeVar("T", bound="ColumnBase")
-# TODO: This workaround allows type hints for `slice`, since `slice` is a
-# method in ColumnBase.
-Slice = TypeVar("Slice", bound=slice)
-
 
 class ColumnBase(Column, Serializable, BinaryOperand, Reducible):
     _VALID_REDUCTIONS = {
@@ -201,7 +197,9 @@ def __repr__(self):
             f"dtype: {self.dtype}"
         )
 
-    def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
+    def to_pandas(
+        self, index: Optional[pd.Index] = None, **kwargs
+    ) -> pd.Series:
         """Convert object to pandas type.
 
         The default implementation falls back to PyArrow for the conversion.
@@ -244,11 +242,11 @@ def values(self) -> "cupy.ndarray":
         return cupy.asarray(self.data_array_view(mode="write"))
 
     def find_and_replace(
-        self: T,
+        self,
         to_replace: ColumnLike,
         replacement: ColumnLike,
         all_nan: bool = False,
-    ) -> T:
+    ) -> Self:
         raise NotImplementedError
 
     def clip(self, lo: ScalarLike, hi: ScalarLike) -> ColumnBase:
@@ -397,7 +395,7 @@ def _fill(
         begin: int,
         end: int,
         inplace: bool = False,
-    ) -> Optional[ColumnBase]:
+    ) -> Optional[Self]:
         if end <= begin or begin >= self.size:
             return self if inplace else self.copy()
 
@@ -437,15 +435,15 @@ def nullmask(self) -> Buffer:
             raise ValueError("Column has no null mask")
         return self.mask_array_view(mode="read")
 
-    def force_deep_copy(self: T) -> T:
+    def force_deep_copy(self) -> Self:
         """
         A method to create deep copy irrespective of whether
         `copy-on-write` is enabled.
         """
         result = libcudf.copying.copy_column(self)
-        return cast(T, result._with_type_metadata(self.dtype))
+        return result._with_type_metadata(self.dtype)
 
-    def copy(self: T, deep: bool = True) -> T:
+    def copy(self, deep: bool = True) -> Self:
         """
         Makes a copy of the Column.
 
@@ -467,7 +465,7 @@ def copy(self: T, deep: bool = True) -> T:
             return self.force_deep_copy()
         else:
             return cast(
-                T,
+                Self,
                 build_column(
                     data=self.base_data
                     if self.base_data is None
@@ -548,7 +546,9 @@ def element_indexing(self, index: int):
 
         return libcudf.copying.get_element(self, idx).value
 
-    def slice(self, start: int, stop: int, stride: int = None) -> ColumnBase:
+    def slice(
+        self, start: int, stop: int, stride: Optional[int] = None
+    ) -> ColumnBase:
         stride = 1 if stride is None else stride
         if start < 0:
             start = start + len(self)
@@ -607,8 +607,10 @@ def _wrap_binop_normalization(self, other):
         return self.normalize_binop_value(other)
 
     def _scatter_by_slice(
-        self, key: Slice, value: Union[cudf.core.scalar.Scalar, ColumnBase]
-    ) -> Optional[ColumnBase]:
+        self,
+        key: builtins.slice,
+        value: Union[cudf.core.scalar.Scalar, ColumnBase],
+    ) -> Optional[Self]:
         """If this function returns None, it's either a no-op (slice is empty),
         or the inplace replacement is already performed (fill-in-place).
         """
@@ -646,7 +648,7 @@ def _scatter_by_column(
         self,
         key: cudf.core.column.NumericalColumn,
         value: Union[cudf.core.scalar.Scalar, ColumnBase],
-    ) -> ColumnBase:
+    ) -> Self:
         if is_bool_dtype(key.dtype):
             # `key` is boolean mask
             if len(key) != len(self):
@@ -697,11 +699,11 @@ def _check_scatter_key_length(
                 raise ValueError(msg)
 
     def fillna(
-        self: T,
+        self,
         value: Any = None,
-        method: str = None,
-        dtype: Dtype = None,
-    ) -> T:
+        method: Optional[str] = None,
+        dtype: Optional[Dtype] = None,
+    ) -> Self:
         """Fill null values with ``value``.
 
         Returns a copy with null filled.
@@ -769,8 +771,8 @@ def quantile(
         raise TypeError(f"cannot perform quantile with type {self.dtype}")
 
     def take(
-        self: T, indices: ColumnBase, nullify: bool = False, check_bounds=True
-    ) -> T:
+        self, indices: ColumnBase, nullify: bool = False, check_bounds=True
+    ) -> Self:
         """Return Column by taking values from the corresponding *indices*.
 
         Skip bounds checking if check_bounds is False.
@@ -778,7 +780,7 @@ def take(
         """
         # Handle zero size
         if indices.size == 0:
-            return cast(T, column_empty_like(self, newsize=0))
+            return cast(Self, column_empty_like(self, newsize=0))
 
         # TODO: For performance, the check and conversion of gather map should
         # be done by the caller. This check will be removed in future release.
@@ -1097,7 +1099,6 @@ def apply_boolean_mask(self, mask) -> ColumnBase:
     def argsort(
         self, ascending: bool = True, na_position: str = "last"
     ) -> "cudf.core.column.NumericalColumn":
-
         return self.as_frame()._get_sorted_inds(
             ascending=ascending, na_position=na_position
         )
@@ -1244,14 +1245,19 @@ def normalize_binop_value(
     ) -> Union[ColumnBase, ScalarLike]:
         raise NotImplementedError
 
-    def _minmax(self, skipna: bool = None):
+    def _minmax(self, skipna: Optional[bool] = None):
         result_col = self._process_for_reduction(skipna=skipna)
         if isinstance(result_col, ColumnBase):
             return libcudf.reduce.minmax(result_col)
         return result_col
 
     def _reduce(
-        self, op: str, skipna: bool = None, min_count: int = 0, *args, **kwargs
+        self,
+        op: str,
+        skipna: Optional[bool] = None,
+        min_count: int = 0,
+        *args,
+        **kwargs,
     ) -> ScalarLike:
         """Compute {op} of column values.
 
@@ -1273,7 +1279,7 @@ def contains_na_entries(self) -> bool:
         return self.null_count != 0
 
     def _process_for_reduction(
-        self, skipna: bool = None, min_count: int = 0
+        self, skipna: Optional[bool] = None, min_count: int = 0
     ) -> Union[ColumnBase, ScalarLike]:
         skipna = True if skipna is None else skipna
 
@@ -1314,8 +1320,8 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
     def _label_encoding(
         self,
         cats: ColumnBase,
-        dtype: Dtype = None,
-        na_sentinel: ScalarLike = None,
+        dtype: Optional[Dtype] = None,
+        na_sentinel: Optional[ScalarLike] = None,
     ):
         """
         Convert each value in `self` into an integer code, with `cats`
@@ -1389,9 +1395,9 @@ def _return_sentinel_column():
 
 def column_empty_like(
     column: ColumnBase,
-    dtype: Dtype = None,
+    dtype: Optional[Dtype] = None,
     masked: bool = False,
-    newsize: int = None,
+    newsize: Optional[int] = None,
 ) -> ColumnBase:
     """Allocate a new column like the given *column*"""
     if dtype is None:
@@ -1403,8 +1409,10 @@ def column_empty_like(
         and is_categorical_dtype(column.dtype)
         and dtype == column.dtype
     ):
-        column = cast("cudf.core.column.CategoricalColumn", column)
-        codes = column_empty_like(column.codes, masked=masked, newsize=newsize)
+        catcolumn = cast("cudf.core.column.CategoricalColumn", column)
+        codes = column_empty_like(
+            catcolumn.codes, masked=masked, newsize=newsize
+        )
         return build_column(
             data=None,
             dtype=dtype,
@@ -1494,10 +1502,10 @@ def build_column(
     data: Union[Buffer, None],
     dtype: Dtype,
     *,
-    size: int = None,
-    mask: Buffer = None,
+    size: Optional[int] = None,
+    mask: Optional[Buffer] = None,
     offset: int = 0,
-    null_count: int = None,
+    null_count: Optional[int] = None,
     children: Tuple[ColumnBase, ...] = (),
 ) -> ColumnBase:
     """
@@ -1666,10 +1674,10 @@ def build_column(
 def build_categorical_column(
     categories: ColumnBase,
     codes: ColumnBase,
-    mask: Buffer = None,
-    size: int = None,
+    mask: Optional[Buffer] = None,
+    size: Optional[int] = None,
     offset: int = 0,
-    null_count: int = None,
+    null_count: Optional[int] = None,
     ordered: bool = False,
 ) -> "cudf.core.column.CategoricalColumn":
     """
@@ -1757,10 +1765,10 @@ def build_interval_column(
 def build_list_column(
     indices: ColumnBase,
     elements: ColumnBase,
-    mask: Buffer = None,
-    size: int = None,
+    mask: Optional[Buffer] = None,
+    size: Optional[int] = None,
     offset: int = 0,
-    null_count: int = None,
+    null_count: Optional[int] = None,
 ) -> "cudf.core.column.ListColumn":
     """
     Build a ListColumn
@@ -1803,10 +1811,10 @@ def build_struct_column(
     names: Sequence[str],
     children: Tuple[ColumnBase, ...],
     dtype: Optional[Dtype] = None,
-    mask: Buffer = None,
-    size: int = None,
+    mask: Optional[Buffer] = None,
+    size: Optional[int] = None,
     offset: int = 0,
-    null_count: int = None,
+    null_count: Optional[int] = None,
 ) -> "cudf.core.column.StructColumn":
     """
     Build a StructColumn
@@ -1863,9 +1871,9 @@ def _make_copy_replacing_NaT_with_null(column):
 
 def as_column(
     arbitrary: Any,
-    nan_as_null: bool = None,
-    dtype: Dtype = None,
-    length: int = None,
+    nan_as_null: Optional[bool] = None,
+    dtype: Optional[Dtype] = None,
+    length: Optional[int] = None,
 ):
     """Create a Column from an arbitrary object
 
@@ -2106,7 +2114,6 @@ def as_column(
 
             data = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype)
         elif arb_dtype.kind == "m":
-
             time_unit = get_time_unit(arbitrary)
             cast_dtype = time_unit in ("D", "W", "M", "Y")
 
@@ -2466,7 +2473,7 @@ def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]:
 
 def arange(
     start: Union[int, float],
-    stop: Union[int, float] = None,
+    stop: Optional[Union[int, float]] = None,
     step: Union[int, float] = 1,
     dtype=None,
 ) -> cudf.core.column.NumericalColumn:
@@ -2524,7 +2531,9 @@ def arange(
     )
 
 
-def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase:
+def full(
+    size: int, fill_value: ScalarLike, dtype: Optional[Dtype] = None
+) -> ColumnBase:
     """
     Returns a column of given size and dtype, filled with a given value.
 
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 4c65a631adc..c0a2a6ac546 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -6,7 +6,7 @@
 import locale
 import re
 from locale import nl_langinfo
-from typing import Any, Mapping, Sequence, cast
+from typing import Any, Mapping, Optional, Sequence, cast
 
 import numpy as np
 import pandas as pd
@@ -125,10 +125,10 @@ def __init__(
         self,
         data: Buffer,
         dtype: DtypeObj,
-        mask: Buffer = None,
-        size: int = None,  # TODO: make non-optional
+        mask: Optional[Buffer] = None,
+        size: Optional[int] = None,  # TODO: make non-optional
         offset: int = 0,
-        null_count: int = None,
+        null_count: Optional[int] = None,
     ):
         dtype = cudf.dtype(dtype)
 
@@ -202,7 +202,10 @@ def day_of_year(self) -> ColumnBase:
         return self.get_dt_field("day_of_year")
 
     def to_pandas(
-        self, index: pd.Index = None, nullable: bool = False, **kwargs
+        self,
+        index: Optional[pd.Index] = None,
+        nullable: bool = False,
+        **kwargs,
     ) -> "cudf.Series":
         # Workaround until following issue is fixed:
         # https://issues.apache.org/jira/browse/ARROW-9772
@@ -363,7 +366,7 @@ def mean(
 
     def std(
         self,
-        skipna: bool = None,
+        skipna: Optional[bool] = None,
         min_count: int = 0,
         dtype: Dtype = np.float64,
         ddof: int = 1,
@@ -375,7 +378,7 @@ def std(
             * _unit_to_nanoseconds_conversion[self.time_unit],
         )
 
-    def median(self, skipna: bool = None) -> pd.Timestamp:
+    def median(self, skipna: Optional[bool] = None) -> pd.Timestamp:
         return pd.Timestamp(
             self.as_numerical.median(skipna=skipna), unit=self.time_unit
         )
@@ -451,7 +454,10 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
         return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)
 
     def fillna(
-        self, fill_value: Any = None, method: str = None, dtype: Dtype = None
+        self,
+        fill_value: Any = None,
+        method: Optional[str] = None,
+        dtype: Optional[Dtype] = None,
     ) -> DatetimeColumn:
         if fill_value is not None:
             if cudf.utils.utils._isnat(fill_value):
@@ -495,7 +501,6 @@ def isin(self, values: Sequence) -> ColumnBase:
 
     def can_cast_safely(self, to_dtype: Dtype) -> bool:
         if np.issubdtype(to_dtype, np.datetime64):
-
             to_res, _ = np.datetime_data(to_dtype)
             self_res, _ = np.datetime_data(self.dtype)
 
@@ -542,10 +547,10 @@ def __init__(
         self,
         data: Buffer,
         dtype: pd.DatetimeTZDtype,
-        mask: Buffer = None,
-        size: int = None,
+        mask: Optional[Buffer] = None,
+        size: Optional[int] = None,
         offset: int = 0,
-        null_count: int = None,
+        null_count: Optional[int] = None,
     ):
         super().__init__(
             data=data,
@@ -558,7 +563,10 @@ def __init__(
         self._dtype = dtype
 
     def to_pandas(
-        self, index: pd.Index = None, nullable: bool = False, **kwargs
+        self,
+        index: Optional[pd.Index] = None,
+        nullable: bool = False,
+        **kwargs,
     ) -> "cudf.Series":
         return self._local_time.to_pandas().dt.tz_localize(
             self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 9fc7663ffca..420637c1924 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -2,7 +2,7 @@
 
 import warnings
 from decimal import Decimal
-from typing import Any, Sequence, Union, cast
+from typing import Any, Optional, Sequence, Union, cast
 
 import cupy as cp
 import numpy as np
@@ -103,7 +103,10 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str):
         return result
 
     def fillna(
-        self, value: Any = None, method: str = None, dtype: Dtype = None
+        self,
+        value: Any = None,
+        method: Optional[str] = None,
+        dtype: Optional[Dtype] = None,
     ):
         """Fill null values with ``value``.
 
diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py
index 657403a6082..38384d09126 100644
--- a/python/cudf/cudf/core/column/interval.py
+++ b/python/cudf/cudf/core/column/interval.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+from typing import Optional
+
 import pandas as pd
 import pyarrow as pa
 
@@ -19,7 +21,6 @@ def __init__(
         children=(),
         closed="right",
     ):
-
         super().__init__(
             data=None,
             dtype=dtype,
@@ -124,7 +125,9 @@ def as_interval_column(self, dtype, **kwargs):
         else:
             raise ValueError("dtype must be IntervalDtype")
 
-    def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
+    def to_pandas(
+        self, index: Optional[pd.Index] = None, **kwargs
+    ) -> pd.Series:
         # Note: This does not handle null values in the interval column.
         # However, this exact sequence (calling __from_arrow__ on the output of
         # self.to_arrow) is currently the best known way to convert interval
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 4eea64c00d3..0bb9f70f851 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 import pyarrow as pa
+from typing_extensions import Self
 
 import cudf
 from cudf._lib.copying import segmented_gather
@@ -261,7 +262,7 @@ def as_string_column(
         # Call libcudf to format the list column
         return format_list_column(lc, separators)
 
-    def _transform_leaves(self, func, *args, **kwargs):
+    def _transform_leaves(self, func, *args, **kwargs) -> Self:
         # return a new list column with the same nested structure
         # as ``self``, but with the leaf column transformed
         # by applying ``func`` to it
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 87e73d212ef..39798320f88 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -81,10 +81,10 @@ def __init__(
         self,
         data: Buffer,
         dtype: DtypeObj,
-        mask: Buffer = None,
-        size: int = None,  # TODO: make this non-optional
+        mask: Optional[Buffer] = None,
+        size: Optional[int] = None,  # TODO: make this non-optional
         offset: int = 0,
-        null_count: int = None,
+        null_count: Optional[int] = None,
     ):
         dtype = cudf.dtype(dtype)
 
@@ -428,11 +428,11 @@ def _process_values_for_isin(
 
         return lhs, rhs
 
-    def _can_return_nan(self, skipna: bool = None) -> bool:
+    def _can_return_nan(self, skipna: Optional[bool] = None) -> bool:
         return not skipna and self.has_nulls(include_nan=True)
 
     def _process_for_reduction(
-        self, skipna: bool = None, min_count: int = 0
+        self, skipna: Optional[bool] = None, min_count: int = 0
     ) -> Union[NumericalColumn, ScalarLike]:
         skipna = True if skipna is None else skipna
 
@@ -516,8 +516,8 @@ def find_and_replace(
     def fillna(
         self,
         fill_value: Any = None,
-        method: str = None,
-        dtype: Dtype = None,
+        method: Optional[str] = None,
+        dtype: Optional[Dtype] = None,
         fill_nan: bool = True,
     ) -> NumericalColumn:
         """
@@ -684,7 +684,6 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
             ):
                 return True
             else:
-
                 filled = self.fillna(0)
                 return (
                     cudf.Series(filled).astype(to_dtype).astype(filled.dtype)
@@ -720,8 +719,11 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
         return self
 
     def to_pandas(
-        self, index: pd.Index = None, nullable: bool = False, **kwargs
-    ) -> "pd.Series":
+        self,
+        index: Optional[pd.Index] = None,
+        nullable: bool = False,
+        **kwargs,
+    ) -> pd.Series:
         if nullable and self.dtype in np_dtypes_to_pandas_dtypes:
             pandas_nullable_dtype = np_dtypes_to_pandas_dtypes[self.dtype]
             arrow_array = self.to_arrow()
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
index bb7711a3ead..08c2f7cc7b1 100644
--- a/python/cudf/cudf/core/column/numerical_base.py
+++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -1,9 +1,9 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2023, NVIDIA CORPORATION.
 """Define an interface for columns that can perform numerical operations."""
 
 from __future__ import annotations
 
-from typing import cast
+from typing import Optional, cast
 
 import numpy as np
 
@@ -40,10 +40,10 @@ class NumericalBaseColumn(ColumnBase, Scannable):
         "cummax",
     }
 
-    def _can_return_nan(self, skipna: bool = None) -> bool:
+    def _can_return_nan(self, skipna: Optional[bool] = None) -> bool:
         return not skipna and self.has_nulls()
 
-    def kurtosis(self, skipna: bool = None) -> float:
+    def kurtosis(self, skipna: Optional[bool] = None) -> float:
         skipna = True if skipna is None else skipna
 
         if len(self) == 0 or self._can_return_nan(skipna=skipna):
@@ -68,7 +68,7 @@ def kurtosis(self, skipna: bool = None) -> float:
         kurt = term_one_section_one * term_one_section_two - 3 * term_two
         return kurt
 
-    def skew(self, skipna: bool = None) -> ScalarLike:
+    def skew(self, skipna: Optional[bool] = None) -> ScalarLike:
         skipna = True if skipna is None else skipna
 
         if len(self) == 0 or self._can_return_nan(skipna=skipna):
@@ -122,26 +122,39 @@ def quantile(
             )
         return result
 
-    def mean(self, skipna: bool = None, min_count: int = 0, dtype=np.float64):
+    def mean(
+        self,
+        skipna: Optional[bool] = None,
+        min_count: int = 0,
+        dtype=np.float64,
+    ):
         return self._reduce(
             "mean", skipna=skipna, min_count=min_count, dtype=dtype
         )
 
     def var(
-        self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1
+        self,
+        skipna: Optional[bool] = None,
+        min_count: int = 0,
+        dtype=np.float64,
+        ddof=1,
     ):
         return self._reduce(
             "var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
         )
 
     def std(
-        self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1
+        self,
+        skipna: Optional[bool] = None,
+        min_count: int = 0,
+        dtype=np.float64,
+        ddof=1,
     ):
         return self._reduce(
             "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
         )
 
-    def median(self, skipna: bool = None) -> NumericalBaseColumn:
+    def median(self, skipna: Optional[bool] = None) -> NumericalBaseColumn:
         skipna = True if skipna is None else skipna
 
         if self._can_return_nan(skipna=skipna):
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index fefa7beb562..a3163f1cebe 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -259,12 +259,14 @@ def byte_count(self) -> SeriesOrIndex:
         )
 
     @overload
-    def cat(self, sep: str = None, na_rep: str = None) -> str:
+    def cat(
+        self, sep: Optional[str] = None, na_rep: Optional[str] = None
+    ) -> str:
         ...
 
     @overload
     def cat(
-        self, others, sep: str = None, na_rep: str = None
+        self, others, sep: Optional[str] = None, na_rep: Optional[str] = None
     ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]:
         ...
 
@@ -744,7 +746,7 @@ def contains(
         4    False
         dtype: bool
 
-        The ``pat`` may also be a list of strings in which case
+        The ``pat`` may also be a sequence of strings in which case
         the individual strings are searched in corresponding rows.
 
         >>> s2 = cudf.Series(['house', 'dog', 'and', '', ''])
@@ -756,8 +758,6 @@ def contains(
         4     <NA>
         dtype: bool
         """  # noqa W605
-        if case is not True:
-            raise NotImplementedError("`case` parameter is not yet supported")
         if na is not np.nan:
             raise NotImplementedError("`na` parameter is not yet supported")
         if regex and isinstance(pat, re.Pattern):
@@ -767,25 +767,34 @@ def contains(
             raise NotImplementedError(
                 "unsupported value for `flags` parameter"
             )
-
-        if pat is None:
-            result_col = column.column_empty(
-                len(self._column), dtype="bool", masked=True
+        if regex and not case:
+            raise NotImplementedError(
+                "`case=False` only supported when `regex=False`"
             )
-        elif is_scalar(pat):
+
+        if is_scalar(pat):
             if regex:
                 result_col = libstrings.contains_re(self._column, pat, flags)
             else:
-                result_col = libstrings.contains(
-                    self._column, cudf.Scalar(pat, "str")
-                )
+                if case is False:
+                    input_column = libstrings.to_lower(self._column)
+                    pat = cudf.Scalar(pat.lower(), dtype="str")  # type: ignore
+                else:
+                    input_column = self._column
+                    pat = cudf.Scalar(pat, dtype="str")  # type: ignore
+                result_col = libstrings.contains(input_column, pat)
         else:
-            result_col = libstrings.contains_multiple(
-                self._column, column.as_column(pat, dtype="str")
-            )
+            # TODO: we silently ignore the `regex=` flag here
+            if case is False:
+                input_column = libstrings.to_lower(self._column)
+                pat = libstrings.to_lower(column.as_column(pat, dtype="str"))
+            else:
+                input_column = self._column
+                pat = column.as_column(pat, dtype="str")
+            result_col = libstrings.contains_multiple(input_column, pat)
         return self._return_or_inplace(result_col)
 
-    def like(self, pat: str, esc: str = None) -> SeriesOrIndex:
+    def like(self, pat: str, esc: Optional[str] = None) -> SeriesOrIndex:
         """
         Test if a like pattern matches a string of a Series or Index.
 
@@ -1065,7 +1074,10 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex:
         )
 
     def slice(
-        self, start: int = None, stop: int = None, step: int = None
+        self,
+        start: Optional[int] = None,
+        stop: Optional[int] = None,
+        step: Optional[int] = None,
     ) -> SeriesOrIndex:
         """
         Slice substrings from each element in the Series or Index.
@@ -2040,7 +2052,7 @@ def istitle(self) -> SeriesOrIndex:
         return self._return_or_inplace(libstrings.is_title(self._column))
 
     def filter_alphanum(
-        self, repl: str = None, keep: bool = True
+        self, repl: Optional[str] = None, keep: bool = True
     ) -> SeriesOrIndex:
         """
         Remove non-alphanumeric characters from strings in this column.
@@ -2126,7 +2138,10 @@ def slice_from(
         )
 
     def slice_replace(
-        self, start: int = None, stop: int = None, repl: str = None
+        self,
+        start: Optional[int] = None,
+        stop: Optional[int] = None,
+        repl: Optional[str] = None,
     ) -> SeriesOrIndex:
         """
         Replace the specified section of each string with a new string.
@@ -2214,7 +2229,9 @@ def slice_replace(
             ),
         )
 
-    def insert(self, start: int = 0, repl: str = None) -> SeriesOrIndex:
+    def insert(
+        self, start: int = 0, repl: Optional[str] = None
+    ) -> SeriesOrIndex:
         """
         Insert the specified string into each string in the specified
         position.
@@ -2394,10 +2411,10 @@ def get_json_object(
 
     def split(
         self,
-        pat: str = None,
+        pat: Optional[str] = None,
         n: int = -1,
         expand: bool = False,
-        regex: bool = None,
+        regex: Optional[bool] = None,
     ) -> SeriesOrIndex:
         """
         Split strings around given separator/delimiter.
@@ -2562,10 +2579,10 @@ def split(
 
     def rsplit(
         self,
-        pat: str = None,
+        pat: Optional[str] = None,
         n: int = -1,
         expand: bool = False,
-        regex: bool = None,
+        regex: Optional[bool] = None,
     ) -> SeriesOrIndex:
         """
         Split strings around given separator/delimiter.
@@ -3214,7 +3231,7 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex:
             libstrings.rjust(self._column, width, fillchar)
         )
 
-    def strip(self, to_strip: str = None) -> SeriesOrIndex:
+    def strip(self, to_strip: Optional[str] = None) -> SeriesOrIndex:
         r"""
         Remove leading and trailing characters.
 
@@ -3273,7 +3290,7 @@ def strip(self, to_strip: str = None) -> SeriesOrIndex:
             libstrings.strip(self._column, cudf.Scalar(to_strip, "str"))
         )
 
-    def lstrip(self, to_strip: str = None) -> SeriesOrIndex:
+    def lstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex:
         r"""
         Remove leading and trailing characters.
 
@@ -3320,7 +3337,7 @@ def lstrip(self, to_strip: str = None) -> SeriesOrIndex:
             libstrings.lstrip(self._column, cudf.Scalar(to_strip, "str"))
         )
 
-    def rstrip(self, to_strip: str = None) -> SeriesOrIndex:
+    def rstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex:
         r"""
         Remove leading and trailing characters.
 
@@ -3973,7 +3990,9 @@ def removeprefix(self, prefix: str) -> SeriesOrIndex:
         )
         return self._return_or_inplace(result)
 
-    def find(self, sub: str, start: int = 0, end: int = None) -> SeriesOrIndex:
+    def find(
+        self, sub: str, start: int = 0, end: Optional[int] = None
+    ) -> SeriesOrIndex:
         """
         Return lowest indexes in each strings in the Series/Index
         where the substring is fully contained between ``[start:end]``.
@@ -4029,7 +4048,7 @@ def find(self, sub: str, start: int = 0, end: int = None) -> SeriesOrIndex:
         return self._return_or_inplace(result_col)
 
     def rfind(
-        self, sub: str, start: int = 0, end: int = None
+        self, sub: str, start: int = 0, end: Optional[int] = None
     ) -> SeriesOrIndex:
         """
         Return highest indexes in each strings in the Series/Index
@@ -4090,7 +4109,7 @@ def rfind(
         return self._return_or_inplace(result_col)
 
     def index(
-        self, sub: str, start: int = 0, end: int = None
+        self, sub: str, start: int = 0, end: Optional[int] = None
     ) -> SeriesOrIndex:
         """
         Return lowest indexes in each strings where the substring
@@ -4152,7 +4171,7 @@ def index(
             return result
 
     def rindex(
-        self, sub: str, start: int = 0, end: int = None
+        self, sub: str, start: int = 0, end: Optional[int] = None
     ) -> SeriesOrIndex:
         """
         Return highest indexes in each strings where the substring
@@ -4419,7 +4438,7 @@ def translate(self, table: dict) -> SeriesOrIndex:
         )
 
     def filter_characters(
-        self, table: dict, keep: bool = True, repl: str = None
+        self, table: dict, keep: bool = True, repl: Optional[str] = None
     ) -> SeriesOrIndex:
         """
         Remove characters from each string using the character ranges
@@ -4870,7 +4889,7 @@ def ngrams_tokenize(
         )
 
     def replace_tokens(
-        self, targets, replacements, delimiter: str = None
+        self, targets, replacements, delimiter: Optional[str] = None
     ) -> SeriesOrIndex:
         """
         The targets tokens are searched for within each string in the series
@@ -4955,8 +4974,8 @@ def replace_tokens(
     def filter_tokens(
         self,
         min_token_length: int,
-        replacement: str = None,
-        delimiter: str = None,
+        replacement: Optional[str] = None,
+        delimiter: Optional[str] = None,
     ) -> SeriesOrIndex:
         """
         Remove tokens from within each string in the series that are
@@ -5344,10 +5363,10 @@ class StringColumn(column.ColumnBase):
 
     def __init__(
         self,
-        mask: Buffer = None,
-        size: int = None,  # TODO: make non-optional
+        mask: Optional[Buffer] = None,
+        size: Optional[int] = None,  # TODO: make non-optional
         offset: int = 0,
-        null_count: int = None,
+        null_count: Optional[int] = None,
         children: Tuple["column.ColumnBase", ...] = (),
     ):
         dtype = cudf.api.types.dtype("object")
@@ -5477,8 +5496,8 @@ def to_arrow(self) -> pa.Array:
 
     def sum(
         self,
-        skipna: bool = None,
-        dtype: Dtype = None,
+        skipna: Optional[bool] = None,
+        dtype: Optional[Dtype] = None,
         min_count: int = 0,
     ):
         result_col = self._process_for_reduction(
@@ -5609,8 +5628,11 @@ def values(self) -> cupy.ndarray:
         raise TypeError("String Arrays is not yet implemented in cudf")
 
     def to_pandas(
-        self, index: pd.Index = None, nullable: bool = False, **kwargs
-    ) -> "pd.Series":
+        self,
+        index: Optional[pd.Index] = None,
+        nullable: bool = False,
+        **kwargs,
+    ) -> pd.Series:
         if nullable:
             pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow())
             pd_series = pd.Series(pandas_array, copy=False)
@@ -5679,8 +5701,8 @@ def find_and_replace(
     def fillna(
         self,
         fill_value: Any = None,
-        method: str = None,
-        dtype: Dtype = None,
+        method: Optional[str] = None,
+        dtype: Optional[Dtype] = None,
     ) -> StringColumn:
         if fill_value is not None:
             if not is_scalar(fill_value):
@@ -5828,7 +5850,6 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase":
 
 
 def _get_cols_list(parent_obj, others):
-
     parent_index = (
         parent_obj.index if isinstance(parent_obj, cudf.Series) else parent_obj
     )
diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py
index 6838d711641..8558faa7f24 100644
--- a/python/cudf/cudf/core/column/struct.py
+++ b/python/cudf/cudf/core/column/struct.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 from functools import cached_property
+from typing import Optional
 
 import pandas as pd
 import pyarrow as pa
@@ -57,7 +58,9 @@ def to_arrow(self):
             pa_type, len(self), buffers, children=children
         )
 
-    def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
+    def to_pandas(
+        self, index: Optional[pd.Index] = None, **kwargs
+    ) -> pd.Series:
         # We cannot go via Arrow's `to_pandas` because of the following issue:
         # https://issues.apache.org/jira/browse/ARROW-12680
 
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index e7979fa4d27..e1d913742ec 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import datetime
-from typing import Any, Sequence, cast
+from typing import Any, Optional, Sequence, cast
 
 import numpy as np
 import pandas as pd
@@ -80,10 +80,10 @@ def __init__(
         self,
         data: Buffer,
         dtype: Dtype,
-        size: int = None,  # TODO: make non-optional
-        mask: Buffer = None,
+        size: Optional[int] = None,  # TODO: make non-optional
+        mask: Optional[Buffer] = None,
         offset: int = 0,
-        null_count: int = None,
+        null_count: Optional[int] = None,
     ):
         dtype = cudf.dtype(dtype)
 
@@ -251,7 +251,10 @@ def time_unit(self) -> str:
         return self._time_unit
 
     def fillna(
-        self, fill_value: Any = None, method: str = None, dtype: Dtype = None
+        self,
+        fill_value: Any = None,
+        method: Optional[str] = None,
+        dtype: Optional[Dtype] = None,
     ) -> TimeDeltaColumn:
         if fill_value is not None:
             if cudf.utils.utils._isnat(fill_value):
@@ -313,7 +316,7 @@ def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta:
             unit=self.time_unit,
         )
 
-    def median(self, skipna: bool = None) -> pd.Timedelta:
+    def median(self, skipna: Optional[bool] = None) -> pd.Timedelta:
         return pd.Timedelta(
             self.as_numerical.median(skipna=skipna), unit=self.time_unit
         )
@@ -340,9 +343,9 @@ def quantile(
 
     def sum(
         self,
-        skipna: bool = None,
+        skipna: Optional[bool] = None,
         min_count: int = 0,
-        dtype: Dtype = None,
+        dtype: Optional[Dtype] = None,
     ) -> pd.Timedelta:
         return pd.Timedelta(
             # Since sum isn't overridden in Numerical[Base]Column, mypy only
@@ -356,7 +359,7 @@ def sum(
 
     def std(
         self,
-        skipna: bool = None,
+        skipna: Optional[bool] = None,
         min_count: int = 0,
         dtype: Dtype = np.float64,
         ddof: int = 1,
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 707eda3f5e6..832d5acf2de 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -101,7 +101,7 @@ class ColumnAccessor(abc.MutableMapping):
 
     def __init__(
         self,
-        data: Union[abc.MutableMapping, ColumnAccessor] = None,
+        data: Union[abc.MutableMapping, ColumnAccessor, None] = None,
         multiindex: bool = False,
         level_names=None,
     ):
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 8c8f0119b3f..2aa370ac8e5 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -22,7 +22,6 @@
     Optional,
     Set,
     Tuple,
-    TypeVar,
     Union,
 )
 
@@ -37,6 +36,7 @@
 from pandas.core.dtypes.common import is_float, is_integer
 from pandas.io.formats import console
 from pandas.io.formats.printing import pprint_thing
+from typing_extensions import Self
 
 import cudf
 import cudf.core.common
@@ -105,9 +105,6 @@
     _external_only_api,
 )
 
-T = TypeVar("T", bound="DataFrame")
-
-
 _cupy_nan_methods_map = {
     "min": "nanmin",
     "max": "nanmax",
@@ -1306,7 +1303,7 @@ def __delitem__(self, name):
         self._drop_column(name)
 
     @_cudf_nvtx_annotate
-    def _slice(self: T, arg: slice) -> T:
+    def _slice(self, arg: slice) -> Self:
         """
         _slice : slice the frame as per the arg
 
@@ -1988,8 +1985,8 @@ def from_dict(
         cls,
         data: dict,
         orient: str = "columns",
-        dtype: Dtype = None,
-        columns: list = None,
+        dtype: Optional[Dtype] = None,
+        columns: Optional[list] = None,
     ) -> DataFrame:
         """
         Construct DataFrame from dict of array-like or dicts.
@@ -7139,13 +7136,14 @@ def eval(self, expr: str, inplace: bool = False, **kwargs):
                 "Keyword arguments other than `inplace` are not supported"
             )
 
-        # Have to use a regex match to avoid capturing "=="
-        includes_assignment = re.search("[^=]=[^=]", expr) is not None
+        # Have to use a regex match to avoid capturing ==, >=, or <=
+        equals_sign_regex = "[^=><]=[^=]"
+        includes_assignment = re.search(equals_sign_regex, expr) is not None
 
         # Check if there were multiple statements. Filter out empty lines.
         statements = tuple(filter(None, expr.strip().split("\n")))
         if len(statements) > 1 and any(
-            re.search("[^=]=[^=]", st) is None for st in statements
+            re.search(equals_sign_regex, st) is None for st in statements
         ):
             raise ValueError(
                 "Multi-line expressions are only valid if all expressions "
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 230a5054a00..26b1ee971da 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -16,7 +16,6 @@
     MutableMapping,
     Optional,
     Tuple,
-    TypeVar,
     Union,
 )
 
@@ -24,6 +23,7 @@
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+from typing_extensions import Self
 
 import cudf
 from cudf import _lib as libcudf
@@ -45,8 +45,6 @@
 from cudf.utils.dtypes import find_common_type
 from cudf.utils.utils import _array_ufunc, _cudf_nvtx_annotate
 
-T = TypeVar("T", bound="Frame")
-
 
 # TODO: It looks like Frame is missing a declaration of `copy`, need to add
 class Frame(BinaryOperand, Scannable):
@@ -146,8 +144,8 @@ def _from_columns_like_self(
         return frame._copy_type_metadata(self, override_dtypes=override_dtypes)
 
     def _mimic_inplace(
-        self: T, result: T, inplace: bool = False
-    ) -> Optional[Frame]:
+        self, result: Self, inplace: bool = False
+    ) -> Optional[Self]:
         if inplace:
             for col in self._data:
                 if col in result._data:
@@ -1069,7 +1067,6 @@ def from_arrow(cls, data):
         # Handle dict arrays
         cudf_category_frame = {}
         if len(dict_indices):
-
             dict_indices_table = pa.table(dict_indices)
             data = data.drop(dict_indices_table.column_names)
             indices_columns = libcudf.interop.from_arrow(dict_indices_table)
@@ -1191,11 +1188,11 @@ def _positions_from_column_names(self, column_names):
         ]
 
     def _copy_type_metadata(
-        self: T,
-        other: T,
+        self,
+        other: Self,
         *,
         override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
-    ) -> T:
+    ) -> Self:
         """
         Copy type metadata from each column of `other` to the corresponding
         column of `self`.
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 0b6ea1fb646..f955c5dfbd6 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1095,8 +1095,13 @@ def _normalize_aggs(
             columns = values._columns
             aggs_per_column = (aggs,) * len(columns)
 
+        # is_list_like performs type narrowing but type-checkers don't
+        # know it. One could add a TypeGuard annotation to
+        # is_list_like (see PEP647), but that is less useful than it
+        # seems because unlike the builtin narrowings it only performs
+        # narrowing in the positive case.
         normalized_aggs = [
-            list(agg) if is_list_like(agg) else [agg]
+            list(agg) if is_list_like(agg) else [agg]  # type: ignore
             for agg in aggs_per_column
         ]
         return column_names, columns, normalized_aggs
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 281290e1788..f5dc8298bd2 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -15,7 +15,6 @@
     Optional,
     Tuple,
     Type,
-    TypeVar,
     Union,
 )
 
@@ -65,8 +64,6 @@
 )
 from cudf.utils.utils import _cudf_nvtx_annotate, search_range
 
-T = TypeVar("T", bound="Frame")
-
 
 def _lexsorted_equal_range(
     idx: Union[GenericIndex, cudf.MultiIndex],
@@ -1048,7 +1045,7 @@ def _from_data(
 
     def _binaryop(
         self,
-        other: T,
+        other: Frame,
         op: str,
         fill_value: Any = None,
         *args,
@@ -2601,7 +2598,6 @@ def __init__(
         copy=False,
         name=None,
     ):
-
         if freq is not None:
             raise NotImplementedError("freq is not yet supported")
 
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 95931af038c..3c4d8d84c34 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -27,6 +27,7 @@
 import cupy as cp
 import numpy as np
 import pandas as pd
+from typing_extensions import Self
 
 import cudf
 import cudf._lib as libcudf
@@ -224,8 +225,6 @@ def __init__(self, frame):
 _LocIndexerClass = TypeVar("_LocIndexerClass", bound="_FrameIndexer")
 _IlocIndexerClass = TypeVar("_IlocIndexerClass", bound="_FrameIndexer")
 
-T = TypeVar("T", bound="IndexedFrame")
-
 
 class IndexedFrame(Frame):
     """A frame containing an index.
@@ -357,8 +356,8 @@ def _from_columns_like_self(
         )
 
     def _mimic_inplace(
-        self: T, result: T, inplace: bool = False
-    ) -> Optional[Frame]:
+        self, result: Self, inplace: bool = False
+    ) -> Optional[Self]:
         if inplace:
             self._index = result._index
         return super()._mimic_inplace(result, inplace)
@@ -441,7 +440,7 @@ def _scan(self, op, axis=None, skipna=True):
             results[name] = getattr(result_col, op)()
         return self._from_data(results, self._index)
 
-    def _check_data_index_length_match(self: T) -> None:
+    def _check_data_index_length_match(self) -> None:
         # Validate that the number of rows in the data matches the index if the
         # data is not empty. This is a helper for the constructor.
         if self._data.nrows > 0 and self._data.nrows != len(self._index):
@@ -450,7 +449,7 @@ def _check_data_index_length_match(self: T) -> None:
                 f"match length of index ({len(self._index)})"
             )
 
-    def copy(self: T, deep: bool = True) -> T:
+    def copy(self, deep: bool = True) -> Self:
         """Make a copy of this object's indices and data.
 
         When ``deep=True`` (default), a new object will be created with a
@@ -918,12 +917,12 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1):
         return self._mimic_inplace(output, inplace=inplace)
 
     def _copy_type_metadata(
-        self: T,
-        other: T,
+        self,
+        other: Self,
         include_index: bool = True,
         *,
         override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
-    ) -> T:
+    ) -> Self:
         """
         Copy type metadata from each column of `other` to the corresponding
         column of `self`.
@@ -2322,12 +2321,12 @@ def _n_largest_or_smallest(self, largest, n, columns, keep):
             raise ValueError('keep must be either "first", "last"')
 
     def _align_to_index(
-        self: T,
+        self,
         index: ColumnLike,
         how: str = "outer",
         sort: bool = True,
         allow_non_unique: bool = False,
-    ) -> T:
+    ) -> Self:
         index = cudf.core.index.as_index(index)
 
         if self.index.equals(index):
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 4a9bc89fa34..edabdb34435 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -101,7 +101,6 @@ def __init__(
         name=None,
         **kwargs,
     ):
-
         if sortorder is not None:
             raise NotImplementedError("sortorder is not yet supported")
         if name is not None:
@@ -811,7 +810,6 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
 
     @_cudf_nvtx_annotate
     def _index_and_downcast(self, result, index, index_key):
-
         if isinstance(index_key, (numbers.Number, slice)):
             index_key = [index_key]
         if (
@@ -1069,7 +1067,6 @@ def _is_interval(self):
     @classmethod
     @_cudf_nvtx_annotate
     def _concat(cls, objs):
-
         source_data = [o.to_frame(index=False) for o in objs]
 
         # TODO: Verify if this is really necessary or if we can rely on
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index c4128621148..5f07c57fc80 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -4,7 +4,7 @@
 from __future__ import annotations
 
 import warnings
-from typing import Any, Dict, Optional, Tuple, TypeVar, Union
+from typing import Any, Dict, Optional, Tuple, Union
 
 import cupy
 import numpy as np
@@ -20,8 +20,6 @@
 from cudf.core.frame import Frame
 from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate
 
-T = TypeVar("T", bound="Frame")
-
 
 class SingleColumnFrame(Frame, NotIterable):
     """A one-dimensional frame.
@@ -140,7 +138,6 @@ def to_numpy(
         return super().to_numpy(dtype, copy, na_value).flatten()
 
     def tolist(self):  # noqa: D102
-
         raise TypeError(
             "cuDF does not support conversion to host memory "
             "via the `tolist()` method. Consider using "
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 92ef49e92d9..0ee9f511061 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -1,15 +1,16 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 
 import math
 import re
 import warnings
-from typing import Sequence, Type, TypeVar, Union
+from typing import Sequence, Union
 
 import cupy as cp
 import numpy as np
 import pandas as pd
 import pandas.tseries.offsets as pd_offset
 from pandas.core.tools.datetimes import _unit_map
+from typing_extensions import Self
 
 import cudf
 from cudf import _lib as libcudf
@@ -373,9 +374,6 @@ def get_units(value):
     return value
 
 
-_T = TypeVar("_T", bound="DateOffset")
-
-
 class DateOffset:
     """
     An object used for binary ops where calendrical arithmetic
@@ -647,7 +645,7 @@ def __repr__(self):
         return repr_str
 
     @classmethod
-    def _from_freqstr(cls: Type[_T], freqstr: str) -> _T:
+    def _from_freqstr(cls, freqstr: str) -> Self:
         """
         Parse a string and return a DateOffset object
         expects strings of the form 3D, 25W, 10ms, 42ns, etc.
@@ -669,9 +667,9 @@ def _from_freqstr(cls: Type[_T], freqstr: str) -> _T:
 
     @classmethod
     def _from_pandas_ticks_or_weeks(
-        cls: Type[_T],
+        cls,
         tick: Union[pd.tseries.offsets.Tick, pd.tseries.offsets.Week],
-    ) -> _T:
+    ) -> Self:
         return cls(**{cls._TICK_OR_WEEK_TO_UNITS[type(tick)]: tick.n})
 
     def _maybe_as_fast_pandas_offset(self):
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index b3c8468c119..918bd995ed1 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -9794,6 +9794,7 @@ def df_eval(request):
         ("a / b", float),
         ("a * b", int),
         ("a > b", int),
+        ("a >= b", int),
         ("a > b > c", int),
         ("a > b < c", int),
         ("a & b", int),
@@ -9835,7 +9836,7 @@ def test_dataframe_eval(df_eval, expr, dtype):
     assert_eq(expect, got, check_names=False)
 
     # Test inplace
-    if re.search("[^=]=[^=]", expr) is not None:
+    if re.search("[^=><]=[^=]", expr) is not None:
         pdf_eval = df_eval.to_pandas()
         pdf_eval.eval(expr, inplace=True)
         df_eval.eval(expr, inplace=True)
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 33095761fde..7fcad5df9f1 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 
 import datetime
 import decimal
@@ -1896,12 +1896,10 @@ def test_reader_empty_stripe(datadir, fname):
     assert_eq(expected, got)
 
 
-@pytest.mark.xfail(
-    reason="https://github.com/rapidsai/cudf/issues/11890", raises=RuntimeError
-)
-def test_reader_unsupported_offsets():
-    # needs enough data for more than one row group
-    expected = cudf.DataFrame({"str": ["*"] * 10001}, dtype="string")
+# needs enough data for multiple row groups
+@pytest.mark.parametrize("data", [["*"] * 10001, ["**", None] * 5001])
+def test_reader_row_index_order(data):
+    expected = cudf.DataFrame({"str": data}, dtype="string")
 
     buffer = BytesIO()
     expected.to_pandas().to_orc(buffer)
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index c866e064366..12e832ba23b 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -820,6 +820,17 @@ def test_string_contains(ps_gs, pat, regex, flags, flags_raise, na, na_raise):
         assert_eq(expect, got)
 
 
+def test_string_contains_case(ps_gs):
+    ps, gs = ps_gs
+    with pytest.raises(NotImplementedError):
+        gs.str.contains("A", case=False)
+    expected = ps.str.contains("A", regex=False, case=False)
+    got = gs.str.contains("A", regex=False, case=False)
+    assert_eq(expected, got)
+    got = gs.str.contains("a", regex=False, case=False)
+    assert_eq(expected, got)
+
+
 @pytest.mark.parametrize(
     "pat,esc,expect",
     [
diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index f19c373150d..67fc7215fc7 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -30,28 +30,31 @@
 
 
 class CudfEngine(ArrowDatasetEngine):
-    @staticmethod
-    def read_metadata(*args, **kwargs):
-        meta, stats, parts, index = ArrowDatasetEngine.read_metadata(
-            *args, **kwargs
+    @classmethod
+    def _create_dd_meta(cls, dataset_info, **kwargs):
+        # Start with pandas-version of meta
+        meta_pd = super()._create_dd_meta(dataset_info, **kwargs)
+
+        # Convert to cudf
+        meta_cudf = cudf.from_pandas(meta_pd)
+
+        # Re-set "object" dtypes to align with pa schema
+        kwargs = dataset_info.get("kwargs", {})
+        set_object_dtypes_from_pa_schema(
+            meta_cudf,
+            kwargs.get("schema", None),
         )
-        new_meta = cudf.from_pandas(meta)
-        if parts:
-            # Re-set "object" dtypes align with pa schema
-            set_object_dtypes_from_pa_schema(
-                new_meta,
-                parts[0].get("common_kwargs", {}).get("schema", None),
-            )
 
         # If `strings_to_categorical==True`, convert objects to int32
         strings_to_cats = kwargs.get("strings_to_categorical", False)
-        for col in new_meta._data.names:
+        for col in meta_cudf._data.names:
             if (
-                isinstance(new_meta._data[col], cudf.core.column.StringColumn)
+                isinstance(meta_cudf._data[col], cudf.core.column.StringColumn)
                 and strings_to_cats
             ):
-                new_meta._data[col] = new_meta._data[col].astype("int32")
-        return (new_meta, stats, parts, index)
+                meta_cudf._data[col] = meta_cudf._data[col].astype("int32")
+
+        return meta_cudf
 
     @classmethod
     def multi_support(cls):