From 3403eb4005bdddd0c8b3b2b5e5a33ad87c0a1d13 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 30 Apr 2024 19:08:41 +0200 Subject: [PATCH 1/9] Move `vsmem` helper into their own file --- .../device/dispatch/dispatch_merge_sort.cuh | 1 + .../device/dispatch/dispatch_select_if.cuh | 1 + .../dispatch/dispatch_unique_by_key.cuh | 1 + cub/cub/util_device.cuh | 196 -------------- cub/cub/util_vsmem.cuh | 255 ++++++++++++++++++ cub/test/catch2_test_vsmem.cu | 1 + libcudacxx/include/cuda/discard_memory | 1 - 7 files changed, 259 insertions(+), 197 deletions(-) create mode 100644 cub/cub/util_vsmem.cuh diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 47420576030..e687cf5b010 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -42,6 +42,7 @@ #include #include #include +#include #include #include diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index a22f6acd031..332671383f6 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -52,6 +52,7 @@ #include #include #include +#include #include diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index 9531a11b746..f62d6d27537 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -48,6 +48,7 @@ #include #include #include +#include #include diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index 8075444c526..7dc9fc1aa25 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -49,12 +49,10 @@ #include #include -#include #include // for backward compatibility #include -#include #include #include @@ -70,7 +68,6 @@ CUB_NAMESPACE_BEGIN namespace detail { - /** * @brief Helper class template that allows overwriting the `BLOCK_THREAD` and `ITEMS_PER_THREAD` * configurations of a given policy. @@ -82,199 +79,6 @@ struct policy_wrapper_t : PolicyT static constexpr int BLOCK_THREADS = BLOCK_THREADS_; static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD; }; - -/** - * @brief Helper struct to wrap all the information needed to implement virtual shared memory that's passed to a kernel. - * - */ -struct vsmem_t -{ - void* gmem_ptr; -}; - -// The maximum amount of static shared memory available per thread block -// Note that in contrast to dynamic shared memory, static shared memory is still limited to 48 KB -static constexpr std::size_t max_smem_per_block = 48 * 1024; - -/** - * @brief Class template that helps to prevent exceeding the available shared memory per thread block. - * - * @tparam AgentT The agent for which we check whether per-thread block shared memory is sufficient or whether virtual - * shared memory is needed. - */ -template -class vsmem_helper_impl -{ -private: - // Per-block virtual shared memory may be padded to make sure vsmem is an integer multiple of `line_size` - static constexpr std::size_t line_size = 128; - - // The amount of shared memory or virtual shared memory required by the algorithm's agent - static constexpr std::size_t required_smem = sizeof(typename AgentT::TempStorage); - - // Whether we need to allocate global memory-backed virtual shared memory - static constexpr bool needs_vsmem = required_smem > max_smem_per_block; - - // Padding bytes to an integer multiple of `line_size`. 
Only applies to virtual shared memory - static constexpr std::size_t padding_bytes = - (required_smem % line_size == 0) ? 0 : (line_size - (required_smem % line_size)); - -public: - // Type alias to be used for static temporary storage declaration within the algorithm's kernel - using static_temp_storage_t = cub::detail::conditional_t; - - // The amount of global memory-backed virtual shared memory needed, padded to an integer multiple of 128 bytes - static constexpr std::size_t vsmem_per_block = needs_vsmem ? (required_smem + padding_bytes) : 0; - - /** - * @brief Used from within the device algorithm's kernel to get the temporary storage that can be - * passed to the agent, specialized for the case when we can use native shared memory as temporary - * storage. - */ - static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& - get_temp_storage(typename AgentT::TempStorage& static_temp_storage, vsmem_t&) - { - return static_temp_storage; - } - - /** - * @brief Used from within the device algorithm's kernel to get the temporary storage that can be - * passed to the agent, specialized for the case when we can use native shared memory as temporary - * storage and taking a linear block id. - */ - static __device__ __forceinline__ typename AgentT::TempStorage& - get_temp_storage(typename AgentT::TempStorage& static_temp_storage, vsmem_t&, std::size_t) - { - return static_temp_storage; - } - - /** - * @brief Used from within the device algorithm's kernel to get the temporary storage that can be - * passed to the agent, specialized for the case when we have to use global memory-backed - * virtual shared memory as temporary storage. - */ - static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& - get_temp_storage(cub::NullType& static_temp_storage, vsmem_t& vsmem) - { - return *reinterpret_cast( - static_cast(vsmem.gmem_ptr) + (vsmem_per_block * blockIdx.x)); - } - - /** - * @brief Used from within the device algorithm's kernel to get the temporary storage that can be - * passed to the agent, specialized for the case when we have to use global memory-backed - * virtual shared memory as temporary storage and taking a linear block id. - */ - static __device__ __forceinline__ typename AgentT::TempStorage& - get_temp_storage(cub::NullType& static_temp_storage, vsmem_t& vsmem, std::size_t linear_block_id) - { - return *reinterpret_cast( - static_cast(vsmem.gmem_ptr) + (vsmem_per_block * linear_block_id)); - } - - /** - * @brief Hints to discard modified cache lines of the used virtual shared memory. - * modified cache lines. - * - * @note Needs to be followed by `__syncthreads()` if the function returns true and the virtual shared memory is - * supposed to be reused after this function call. - */ - template ::type = 0> - static _CCCL_DEVICE _CCCL_FORCEINLINE bool discard_temp_storage(typename AgentT::TempStorage& temp_storage) - { - return false; - } - - /** - * @brief Hints to discard modified cache lines of the used virtual shared memory. - * modified cache lines. - * - * @note Needs to be followed by `__syncthreads()` if the function returns true and the virtual shared memory is - * supposed to be reused after this function call. 
- */ - template ::type = 0> - static _CCCL_DEVICE _CCCL_FORCEINLINE bool discard_temp_storage(typename AgentT::TempStorage& temp_storage) - { - // Ensure all threads finished using temporary storage - CTA_SYNC(); - - const std::size_t linear_tid = threadIdx.x; - const std::size_t block_stride = line_size * blockDim.x; - - char* ptr = reinterpret_cast(&temp_storage); - auto ptr_end = ptr + vsmem_per_block; - - // 128 byte-aligned virtual shared memory discard - for (auto thread_ptr = ptr + (linear_tid * line_size); thread_ptr < ptr_end; thread_ptr += block_stride) - { - cuda::discard_memory(thread_ptr, line_size); - } - - return true; - } -}; - -template -constexpr bool use_fallback_agent() -{ - return (sizeof(typename DefaultAgentT::TempStorage) > max_smem_per_block) - && (sizeof(typename FallbackAgentT::TempStorage) <= max_smem_per_block); -} - -/** - * @brief Class template that helps to prevent exceeding the available shared memory per thread block with two measures: - * (1) If an agent's `TempStorage` declaration exceeds the maximum amount of shared memory per thread block, we check - * whether using a fallback policy, e.g., with a smaller tile size, would fit into shared memory. - * (2) If the fallback still doesn't fit into shared memory, we make use of virtual shared memory that is backed by - * global memory. - * - * @tparam DefaultAgentPolicyT The default tuning policy that is used if the default agent's shared memory requirements - * fall within the bounds of `max_smem_per_block` or when virtual shared memory is needed - * @tparam DefaultAgentT The default agent, instantiated with the given default tuning policy - * @tparam FallbackAgentPolicyT A fallback tuning policy that may exhibit lower shared memory requirements, e.g., by - * using a smaller tile size, than the default. This fallback policy is used if and only if the shared memory - * requirements of the default agent exceed `max_smem_per_block`, yet the shared memory requirements of the fallback - * agent falls within the bounds of `max_smem_per_block`. - * @tparam FallbackAgentT The fallback agent, instantiated with the given fallback tuning policy - */ -template ()> -struct vsmem_helper_with_fallback_impl : public vsmem_helper_impl -{ - using agent_t = DefaultAgentT; - using agent_policy_t = DefaultAgentPolicyT; -}; -template -struct vsmem_helper_with_fallback_impl - : public vsmem_helper_impl -{ - using agent_t = FallbackAgentT; - using agent_policy_t = FallbackAgentPolicyT; -}; - -/** - * @brief Alias template for the `vsmem_helper_with_fallback_impl` that instantiates the given AgentT template with the - * respective policy as first template parameter, followed by the parameters captured by the `AgentParamsT` template - * parameter pack. - */ -template class AgentT, typename... AgentParamsT> -using vsmem_helper_fallback_policy_t = - vsmem_helper_with_fallback_impl, - FallbackPolicyT, - AgentT>; - -/** - * @brief Alias template for the `vsmem_helper_t` by using a simple fallback policy that uses `DefaultPolicyT` as basis, - * overwriting `64` threads per block and `1` item per thread. - */ -template class AgentT, typename... 
AgentParamsT> -using vsmem_helper_default_fallback_policy_t = - vsmem_helper_fallback_policy_t, AgentT, AgentParamsT...>; - } // namespace detail /** diff --git a/cub/cub/util_vsmem.cuh b/cub/cub/util_vsmem.cuh new file mode 100644 index 00000000000..5eebc8533db --- /dev/null +++ b/cub/cub/util_vsmem.cuh @@ -0,0 +1,255 @@ +/****************************************************************************** + * Copyright (c) 2023-24, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * This file contains facilities that help to prevent exceeding the available shared memory per thread block + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +#include + +CUB_NAMESPACE_BEGIN + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +namespace detail +{ + +/** + * @brief Helper struct to wrap all the information needed to implement virtual shared memory that's passed to a kernel. + * + */ +struct vsmem_t +{ + void* gmem_ptr; +}; + +// The maximum amount of static shared memory available per thread block +// Note that in contrast to dynamic shared memory, static shared memory is still limited to 48 KB +static constexpr std::size_t max_smem_per_block = 48 * 1024; + +/** + * @brief Class template that helps to prevent exceeding the available shared memory per thread block. + * + * @tparam AgentT The agent for which we check whether per-thread block shared memory is sufficient or whether virtual + * shared memory is needed. 
+ */ +template +class vsmem_helper_impl +{ +private: + // Per-block virtual shared memory may be padded to make sure vsmem is an integer multiple of `line_size` + static constexpr std::size_t line_size = 128; + + // The amount of shared memory or virtual shared memory required by the algorithm's agent + static constexpr std::size_t required_smem = sizeof(typename AgentT::TempStorage); + + // Whether we need to allocate global memory-backed virtual shared memory + static constexpr bool needs_vsmem = required_smem > max_smem_per_block; + + // Padding bytes to an integer multiple of `line_size`. Only applies to virtual shared memory + static constexpr std::size_t padding_bytes = + (required_smem % line_size == 0) ? 0 : (line_size - (required_smem % line_size)); + +public: + // Type alias to be used for static temporary storage declaration within the algorithm's kernel + using static_temp_storage_t = cub::detail::conditional_t; + + // The amount of global memory-backed virtual shared memory needed, padded to an integer multiple of 128 bytes + static constexpr std::size_t vsmem_per_block = needs_vsmem ? (required_smem + padding_bytes) : 0; + + /** + * @brief Used from within the device algorithm's kernel to get the temporary storage that can be + * passed to the agent, specialized for the case when we can use native shared memory as temporary + * storage. + */ + static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& + get_temp_storage(typename AgentT::TempStorage& static_temp_storage, vsmem_t&) + { + return static_temp_storage; + } + + /** + * @brief Used from within the device algorithm's kernel to get the temporary storage that can be + * passed to the agent, specialized for the case when we can use native shared memory as temporary + * storage and taking a linear block id. + */ + static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& + get_temp_storage(typename AgentT::TempStorage& static_temp_storage, vsmem_t&, std::size_t) + { + return static_temp_storage; + } + + /** + * @brief Used from within the device algorithm's kernel to get the temporary storage that can be + * passed to the agent, specialized for the case when we have to use global memory-backed + * virtual shared memory as temporary storage. + */ + static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& + get_temp_storage(cub::NullType& static_temp_storage, vsmem_t& vsmem) + { + return *reinterpret_cast( + static_cast(vsmem.gmem_ptr) + (vsmem_per_block * blockIdx.x)); + } + + /** + * @brief Used from within the device algorithm's kernel to get the temporary storage that can be + * passed to the agent, specialized for the case when we have to use global memory-backed + * virtual shared memory as temporary storage and taking a linear block id. + */ + static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& + get_temp_storage(cub::NullType& static_temp_storage, vsmem_t& vsmem, std::size_t linear_block_id) + { + return *reinterpret_cast( + static_cast(vsmem.gmem_ptr) + (vsmem_per_block * linear_block_id)); + } + + /** + * @brief Hints to discard modified cache lines of the used virtual shared memory. + * modified cache lines. + * + * @note Needs to be followed by `__syncthreads()` if the function returns true and the virtual shared memory is + * supposed to be reused after this function call. 
+ */ + template ::type = 0> + static _CCCL_DEVICE _CCCL_FORCEINLINE bool discard_temp_storage(typename AgentT::TempStorage& temp_storage) + { + return false; + } + + /** + * @brief Hints to discard modified cache lines of the used virtual shared memory. + * modified cache lines. + * + * @note Needs to be followed by `__syncthreads()` if the function returns true and the virtual shared memory is + * supposed to be reused after this function call. + */ + template ::type = 0> + static _CCCL_DEVICE _CCCL_FORCEINLINE bool discard_temp_storage(typename AgentT::TempStorage& temp_storage) + { + // Ensure all threads finished using temporary storage + CTA_SYNC(); + + const std::size_t linear_tid = threadIdx.x; + const std::size_t block_stride = line_size * blockDim.x; + + char* ptr = reinterpret_cast(&temp_storage); + auto ptr_end = ptr + vsmem_per_block; + + // 128 byte-aligned virtual shared memory discard + for (auto thread_ptr = ptr + (linear_tid * line_size); thread_ptr < ptr_end; thread_ptr += block_stride) + { + cuda::discard_memory(thread_ptr, line_size); + } + return true; + } +}; + +template +constexpr bool use_fallback_agent() +{ + return (sizeof(typename DefaultAgentT::TempStorage) > max_smem_per_block) + && (sizeof(typename FallbackAgentT::TempStorage) <= max_smem_per_block); +} + +/** + * @brief Class template that helps to prevent exceeding the available shared memory per thread block with two measures: + * (1) If an agent's `TempStorage` declaration exceeds the maximum amount of shared memory per thread block, we check + * whether using a fallback policy, e.g., with a smaller tile size, would fit into shared memory. + * (2) If the fallback still doesn't fit into shared memory, we make use of virtual shared memory that is backed by + * global memory. + * + * @tparam DefaultAgentPolicyT The default tuning policy that is used if the default agent's shared memory requirements + * fall within the bounds of `max_smem_per_block` or when virtual shared memory is needed + * @tparam DefaultAgentT The default agent, instantiated with the given default tuning policy + * @tparam FallbackAgentPolicyT A fallback tuning policy that may exhibit lower shared memory requirements, e.g., by + * using a smaller tile size, than the default. This fallback policy is used if and only if the shared memory + * requirements of the default agent exceed `max_smem_per_block`, yet the shared memory requirements of the fallback + * agent falls within the bounds of `max_smem_per_block`. + * @tparam FallbackAgentT The fallback agent, instantiated with the given fallback tuning policy + */ +template ()> +struct vsmem_helper_with_fallback_impl : public vsmem_helper_impl +{ + using agent_t = DefaultAgentT; + using agent_policy_t = DefaultAgentPolicyT; +}; +template +struct vsmem_helper_with_fallback_impl + : public vsmem_helper_impl +{ + using agent_t = FallbackAgentT; + using agent_policy_t = FallbackAgentPolicyT; +}; + +/** + * @brief Alias template for the `vsmem_helper_with_fallback_impl` that instantiates the given AgentT template with the + * respective policy as first template parameter, followed by the parameters captured by the `AgentParamsT` template + * parameter pack. + */ +template class AgentT, typename... 
AgentParamsT> +using vsmem_helper_fallback_policy_t = + vsmem_helper_with_fallback_impl, + FallbackPolicyT, + AgentT>; + +/** + * @brief Alias template for the `vsmem_helper_t` by using a simple fallback policy that uses `DefaultPolicyT` as basis, + * overwriting `64` threads per block and `1` item per thread. + */ +template class AgentT, typename... AgentParamsT> +using vsmem_helper_default_fallback_policy_t = + vsmem_helper_fallback_policy_t, AgentT, AgentParamsT...>; + +} // namespace detail + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_vsmem.cu b/cub/test/catch2_test_vsmem.cu index 1eab3d31ad2..b8d3f7e37be 100644 --- a/cub/test/catch2_test_vsmem.cu +++ b/cub/test/catch2_test_vsmem.cu @@ -31,6 +31,7 @@ #include #include #include +#include #include "catch2/catch.hpp" #include "catch2_test_helper.h" diff --git a/libcudacxx/include/cuda/discard_memory b/libcudacxx/include/cuda/discard_memory index 5893bf6108e..cc4963874ae 100644 --- a/libcudacxx/include/cuda/discard_memory +++ b/libcudacxx/include/cuda/discard_memory @@ -22,7 +22,6 @@ #endif // no system header #include -#include _LIBCUDACXX_BEGIN_NAMESPACE_CUDA From 8344490af89ebcaf7cfee66665c347ff47337670 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 30 Apr 2024 19:09:10 +0200 Subject: [PATCH 2/9] Add missing includes of `cuda::std::min` and `cuda::std::max` to functional until we get `` --- libcudacxx/include/cuda/functional | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libcudacxx/include/cuda/functional b/libcudacxx/include/cuda/functional index 955631e23a5..7820c8352cc 100644 --- a/libcudacxx/include/cuda/functional +++ b/libcudacxx/include/cuda/functional @@ -68,6 +68,8 @@ #include #include #include +#include +#include _LIBCUDACXX_BEGIN_NAMESPACE_CUDA namespace __detail From 24568f64de458a35073c0c899e424a3a8ad5badf Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 30 Apr 2024 19:09:40 +0200 Subject: [PATCH 3/9] Add missing include of `` to `cuda_pinned_memory_resource` --- .../include/cuda/__memory_resource/cuda_pinned_memory_resource.h | 1 + 1 file changed, 1 insertion(+) diff --git a/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h index b89f6cdcb17..1597cd2bcf5 100644 --- a/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h @@ -25,6 +25,7 @@ # if !defined(_CCCL_CUDA_COMPILER_NVCC) && !defined(_CCCL_CUDA_COMPILER_NVHPC) # include +# include # endif // !_CCCL_CUDA_COMPILER_NVCC && !_CCCL_CUDA_COMPILER_NVHPC # include From ab4cf5090f6ad8e6d8cd671b1b2554b6cb0213ec Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 30 Apr 2024 19:09:59 +0200 Subject: [PATCH 4/9] Mark `_CCCL_FORCEINLINE` as inline on host --- libcudacxx/include/cuda/std/__cccl/execution_space.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libcudacxx/include/cuda/std/__cccl/execution_space.h b/libcudacxx/include/cuda/std/__cccl/execution_space.h index d72b08462dd..499f480717a 100644 --- a/libcudacxx/include/cuda/std/__cccl/execution_space.h +++ b/libcudacxx/include/cuda/std/__cccl/execution_space.h @@ -27,12 +27,12 @@ # define _CCCL_DEVICE __device__ # define _CCCL_HOST_DEVICE __host__ __device__ # define _CCCL_FORCEINLINE __forceinline__ -#else // ^^^ __CUDACC__ || _NVHPC_CUDA ^^^ / vvv !__CUDACC__ && !_NVHPC_CUDA +#else // ^^^ 
_CCCL_CUDA_COMPILER ^^^ / vvv !_CCCL_CUDA_COMPILER # define _CCCL_HOST # define _CCCL_DEVICE # define _CCCL_HOST_DEVICE -# define _CCCL_FORCEINLINE -#endif // !__CUDACC__ && !_NVHPC_CUDA +# define _CCCL_FORCEINLINE inline +#endif // !_CCCL_CUDA_COMPILER #if !defined(_CCCL_EXEC_CHECK_DISABLE) # if defined(_CCCL_CUDA_COMPILER_NVCC) From 463aecd43ce3296656728b2d3a769ee9882b21c0 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 30 Apr 2024 19:10:23 +0200 Subject: [PATCH 5/9] Avoid copying output iterators in `thrust::copy_if` --- thrust/thrust/system/cuda/detail/copy_if.h | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/thrust/thrust/system/cuda/detail/copy_if.h b/thrust/thrust/system/cuda/detail/copy_if.h index b36285b2f2a..7706a6ac2a4 100644 --- a/thrust/thrust/system/cuda/detail/copy_if.h +++ b/thrust/thrust/system/cuda/detail/copy_if.h @@ -45,6 +45,7 @@ # include # include +# include # include # include # include @@ -95,10 +96,9 @@ struct DispatchCopyIf size_t& temp_storage_bytes, InputIt first, StencilIt stencil, - OutputIt output, + OutputIt& output, Predicate predicate, - OffsetT num_items, - OutputIt& output_end) + OffsetT num_items) { using num_selected_out_it_t = OffsetT*; using equality_op_t = cub::NullType; @@ -147,7 +147,6 @@ struct DispatchCopyIf // Return for empty problems if (num_items == 0) { - output_end = output; return status; } @@ -180,8 +179,7 @@ struct DispatchCopyIf status = cuda_cub::synchronize(policy); CUDA_CUB_RET_IF_FAIL(status); OffsetT num_selected = get_value(policy, d_num_selected_out); - - output_end = output + num_selected; + thrust::advance(output, num_selected); return status; } }; @@ -197,8 +195,7 @@ THRUST_RUNTIME_FUNCTION OutputIt copy_if( { using size_type = typename iterator_traits::difference_type; - size_type num_items = static_cast(thrust::distance(first, last)); - OutputIt output_end{}; + size_type num_items = static_cast(thrust::distance(first, last)); cudaError_t status = cudaSuccess; size_t temp_storage_bytes = 0; @@ -214,7 +211,7 @@ THRUST_RUNTIME_FUNCTION OutputIt copy_if( dispatch32_t::dispatch, dispatch64_t::dispatch, num_items, - (policy, nullptr, temp_storage_bytes, first, stencil, output, predicate, num_items_fixed, output_end)); + (policy, nullptr, temp_storage_bytes, first, stencil, output, predicate, num_items_fixed)); cuda_cub::throw_on_error(status, "copy_if failed on 1st step"); // Allocate temporary storage. 
@@ -227,10 +224,10 @@ THRUST_RUNTIME_FUNCTION OutputIt copy_if( dispatch32_t::dispatch, dispatch64_t::dispatch, num_items, - (policy, temp_storage, temp_storage_bytes, first, stencil, output, predicate, num_items_fixed, output_end)); + (policy, temp_storage, temp_storage_bytes, first, stencil, output, predicate, num_items_fixed)); cuda_cub::throw_on_error(status, "copy_if failed on 2nd step"); - return output_end; + return output; } } // namespace detail From 162a1a76364904eb971bf26adbb229b1aa067f44 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 30 Apr 2024 19:10:51 +0200 Subject: [PATCH 6/9] Try to ensure that `thrust::tuple` and `thrust::pair` work with CTAD --- libcudacxx/include/cuda/std/__utility/pair.h | 47 ++++---- .../cuda/std/detail/libcxx/include/tuple | 13 --- thrust/testing/pair.cu | 14 ++- thrust/testing/tuple.cu | 16 ++- thrust/thrust/detail/functional/actor.h | 2 +- thrust/thrust/detail/functional/actor.inl | 5 - .../detail/tuple_of_iterator_references.h | 14 +-- thrust/thrust/optional.h | 7 +- thrust/thrust/pair.h | 86 +++++++++++++- thrust/thrust/tuple.h | 107 ++++++++++++++++-- 10 files changed, 242 insertions(+), 69 deletions(-) diff --git a/libcudacxx/include/cuda/std/__utility/pair.h b/libcudacxx/include/cuda/std/__utility/pair.h index 9269a09c745..8b17881446a 100644 --- a/libcudacxx/include/cuda/std/__utility/pair.h +++ b/libcudacxx/include/cuda/std/__utility/pair.h @@ -283,18 +283,21 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> : __base(__t1, __t2) {} - template ::template __constructible<_U1, _U2>, + template + using __pair_constructible = typename __pair_constraints<_T1, _T2>::template __constructible<_U1, _U2>; + + template , __enable_if_t<_Constraints::__explicit_constructible, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr pair(_U1&& __u1, _U2&& __u2) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, _U1) && _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T2, _U2)) : __base(_CUDA_VSTD::forward<_U1>(__u1), _CUDA_VSTD::forward<_U2>(__u2)) {} - template ::template __constructible<_U1, _U2>, + template , __enable_if_t<_Constraints::__implicit_constructible, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY constexpr pair(_U1&& __u1, _U2&& __u2) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, _U1) && _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T2, _U2)) @@ -316,9 +319,9 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> pair(pair const&) = default; pair(pair&&) = default; - template ::template __constructible, + template , __enable_if_t<_Constraints::__explicit_constructible, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY explicit _CCCL_CONSTEXPR_CXX14 pair(const pair<_U1, _U2>& __p) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, const _U1&) @@ -326,9 +329,9 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> : __base(__p.first, __p.second) {} - template ::template __constructible, + template , __enable_if_t<_Constraints::__implicit_constructible, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 pair(const pair<_U1, _U2>& __p) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, const _U1&) @@ -337,18 +340,18 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> {} // move constructors - template ::template __constructible<_U1, _U2>, + template , __enable_if_t<_Constraints::__explicit_constructible, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY explicit _CCCL_CONSTEXPR_CXX14 pair(pair<_U1, _U2>&& __p) noexcept( 
_LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, _U1) && _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T2, _U2)) : __base(_CUDA_VSTD::forward<_U1>(__p.first), _CUDA_VSTD::forward<_U2>(__p.second)) {} - template ::template __constructible<_U1, _U2>, + template , __enable_if_t<_Constraints::__implicit_constructible, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 pair(pair<_U1, _U2>&& __p) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, _U1) && _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T2, _U2)) @@ -359,7 +362,7 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> #if defined(__cuda_std__) && !defined(_CCCL_COMPILER_NVRTC) template ::template __constructible, + class _Constraints = __pair_constructible, __enable_if_t<_Constraints::__explicit_constructible, int> = 0> _CCCL_HOST _LIBCUDACXX_HIDE_FROM_ABI explicit _CCCL_CONSTEXPR_CXX14 pair(const ::std::pair<_U1, _U2>& __p) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, const _U1&) @@ -369,7 +372,7 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> template ::template __constructible, + class _Constraints = __pair_constructible, __enable_if_t<_Constraints::__implicit_constructible, int> = 0> _CCCL_HOST _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 pair(const ::std::pair<_U1, _U2>& __p) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, const _U1&) @@ -379,7 +382,7 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> template ::template __constructible<_U1, _U2>, + class _Constraints = __pair_constructible<_U1, _U2>, __enable_if_t<_Constraints::__explicit_constructible, int> = 0> _CCCL_HOST _LIBCUDACXX_HIDE_FROM_ABI explicit _CCCL_CONSTEXPR_CXX14 pair(::std::pair<_U1, _U2>&& __p) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, _U1) && _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T2, _U2)) @@ -388,7 +391,7 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> template ::template __constructible<_U1, _U2>, + class _Constraints = __pair_constructible<_U1, _U2>, __enable_if_t<_Constraints::__implicit_constructible, int> = 0> _CCCL_HOST _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 pair(::std::pair<_U1, _U2>&& __p) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, _U1) && _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T2, _U2)) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple index 22cb1d51fe8..f1dadac04b7 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple @@ -194,10 +194,6 @@ template _LIBCUDACXX_BEGIN_NAMESPACE_STD -template -struct __is_tuple_of_iterator_references : false_type -{}; - // __tuple_leaf struct __tuple_leaf_default_constructor_tag {}; @@ -808,15 +804,6 @@ public: typename __tuple_constraints<_Tp...>::template __tuple_like_constraints<_Tuple>, __invalid_tuple_constraints>; - // Horrible hack to make tuple_of_iterator_references work - template ::value, int> = 0, - __enable_if_t<(tuple_size<_TupleOfIteratorReferences>::value == sizeof...(_Tp)), int> = 0> - _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 tuple(_TupleOfIteratorReferences&& __t) - : tuple(_CUDA_VSTD::forward<_TupleOfIteratorReferences>(__t).template __to_tuple<_Tp...>( - __make_tuple_indices_t())) - {} - template , __enable_if_t::value, int> = 0, diff --git a/thrust/testing/pair.cu b/thrust/testing/pair.cu index 1f1ddcf4d2e..cab025444bd 100644 --- a/thrust/testing/pair.cu 
+++ b/thrust/testing/pair.cu @@ -322,4 +322,16 @@ void TestPairStructuredBindings(void) ASSERT_EQUAL(b, b2); } DECLARE_UNITTEST(TestPairStructuredBindings); -#endif + +void TestPairCTAD(void) +{ + const int a = 42; + const int b = 1337; + thrust::pair p(a, b); + + auto [a2, b2] = p; + ASSERT_EQUAL(a, a2); + ASSERT_EQUAL(b, b2); +} +DECLARE_UNITTEST(TestPairCTAD); +#endif // _CCCL_STD_VER >= 2017 diff --git a/thrust/testing/tuple.cu b/thrust/testing/tuple.cu index c3cb1b23bab..879d920a011 100644 --- a/thrust/testing/tuple.cu +++ b/thrust/testing/tuple.cu @@ -511,7 +511,21 @@ void TestTupleStructuredBindings(void) ASSERT_EQUAL(c, c2); } DECLARE_UNITTEST(TestTupleStructuredBindings); -#endif + +void TestTupleCTAD(void) +{ + const int a = 0; + const char b = 42; + const short c = 1337; + thrust::tuple t(a, b, c); + + auto [a2, b2, c2] = t; + ASSERT_EQUAL(a, a2); + ASSERT_EQUAL(b, b2); + ASSERT_EQUAL(c, c2); +} +DECLARE_UNITTEST(TestTupleCTAD); +#endif // _CCCL_STD_VER >= 2017 // Ensure that we are backwards compatible with the old thrust::tuple implementation static_assert( diff --git a/thrust/thrust/detail/functional/actor.h b/thrust/thrust/detail/functional/actor.h index 0dd0560e69e..3f30d0a1570 100644 --- a/thrust/thrust/detail/functional/actor.h +++ b/thrust/thrust/detail/functional/actor.h @@ -65,7 +65,7 @@ struct actor : Eval { typedef Eval eval_type; - _CCCL_HOST_DEVICE constexpr actor(); + constexpr actor() = default; _CCCL_HOST_DEVICE actor(const Eval& base); diff --git a/thrust/thrust/detail/functional/actor.inl b/thrust/thrust/detail/functional/actor.inl index 81146a594c6..64d367ed15f 100644 --- a/thrust/thrust/detail/functional/actor.inl +++ b/thrust/thrust/detail/functional/actor.inl @@ -48,11 +48,6 @@ namespace detail namespace functional { -template -_CCCL_HOST_DEVICE constexpr actor::actor() - : eval_type() -{} - template _CCCL_HOST_DEVICE actor::actor(const Eval& base) : eval_type(base) diff --git a/thrust/thrust/iterator/detail/tuple_of_iterator_references.h b/thrust/thrust/iterator/detail/tuple_of_iterator_references.h index 574715e4a77..bb43c9d1c4b 100644 --- a/thrust/thrust/iterator/detail/tuple_of_iterator_references.h +++ b/thrust/thrust/iterator/detail/tuple_of_iterator_references.h @@ -67,9 +67,7 @@ class tuple_of_iterator_references : public thrust::tuple using super_t = thrust::tuple; using super_t::super_t; - inline _CCCL_HOST_DEVICE tuple_of_iterator_references() - : super_t() - {} + tuple_of_iterator_references() = default; // allow implicit construction from tuple inline _CCCL_HOST_DEVICE tuple_of_iterator_references(const super_t& other) @@ -138,15 +136,15 @@ class tuple_of_iterator_references : public thrust::tuple } // namespace detail -THRUST_NAMESPACE_END - -_LIBCUDACXX_BEGIN_NAMESPACE_STD - template struct __is_tuple_of_iterator_references> - : integral_constant + : _CUDA_VSTD::true_type {}; +THRUST_NAMESPACE_END + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + // define tuple_size, tuple_element, etc. 
template struct tuple_size> diff --git a/thrust/thrust/optional.h b/thrust/thrust/optional.h index be186a9d78a..f7822324907 100644 --- a/thrust/thrust/optional.h +++ b/thrust/thrust/optional.h @@ -838,7 +838,12 @@ struct nullopt_t /// void foo (thrust::optional); /// foo(thrust::nullopt); //pass an empty optional /// ``` -static constexpr nullopt_t nullopt{nullopt_t::do_not_use{}, nullopt_t::do_not_use{}}; +#ifdef __CUDA_ARCH__ +__device__ static _LIBCUDACXX_CONSTEXPR_GLOBAL +#else +static constexpr +#endif // __CUDA_ARCH__ + nullopt_t nullopt{nullopt_t::do_not_use{}, nullopt_t::do_not_use{}}; class bad_optional_access : public std::exception { diff --git a/thrust/thrust/pair.h b/thrust/thrust/pair.h index 4fd2e8f8333..53d5cc93edf 100644 --- a/thrust/thrust/pair.h +++ b/thrust/thrust/pair.h @@ -50,7 +50,7 @@ THRUST_NAMESPACE_BEGIN * \tparam T A \c pair type of interest. */ template -using tuple_element = ::cuda::std::tuple_element; +using tuple_element = _CUDA_VSTD::tuple_element; /*! This convenience metafunction is included for compatibility with * \p tuple. It returns \c 2, the number of elements of a \p pair, @@ -59,7 +59,7 @@ using tuple_element = ::cuda::std::tuple_element; * \tparam Pair A \c pair type of interest. */ template -using tuple_size = ::cuda::std::tuple_size; +using tuple_size = _CUDA_VSTD::tuple_size; /*! \p pair is a generic data structure encapsulating a heterogeneous * pair of values. @@ -73,10 +73,49 @@ using tuple_size = ::cuda::std::tuple_size; * provided by pair::second_type. */ template -using pair = ::cuda::std::pair; - -using ::cuda::std::get; -using ::cuda::std::make_pair; +struct pair : public _CUDA_VSTD::pair +{ + using super_t = _CUDA_VSTD::pair; + using super_t::super_t; + +#if (defined(_CCCL_COMPILER_GCC) && __GNUC__ < 9) || (defined(_CCCL_COMPILER_CLANG) && __clang_major__ < 12) + // For whatever reason nvcc complains about that constructor being used before being defined in a constexpr variable + constexpr pair() = default; + + template ::template __constructible<_U1, _U2>, + _CUDA_VSTD::__enable_if_t<_Constraints::__implicit_constructible, int> = 0> + _CCCL_HOST_DEVICE constexpr pair(_U1&& __u1, _U2&& __u2) + : super_t(_CUDA_VSTD::forward<_U1>(__u1), _CUDA_VSTD::forward<_U2>(__u2)) + {} +#endif // _CCCL_COMPILER_GCC < 9 || _CCCL_COMPILER_CLANG < 12 +}; + +#if _CCCL_STD_VER >= 2017 +template +_CCCL_HOST_DEVICE pair(_T1, _T2) -> pair<_T1, _T2>; +#endif // _CCCL_STD_VER >= 2017 + +template +inline _CCCL_HOST_DEVICE + _CUDA_VSTD::__enable_if_t<_CUDA_VSTD::__is_swappable::value && _CUDA_VSTD::__is_swappable::value, void> + swap(pair& lhs, pair& rhs) noexcept( + (_CUDA_VSTD::__is_nothrow_swappable::value && _CUDA_VSTD::__is_nothrow_swappable::value)) +{ + lhs.swap(rhs); +} + +template +inline _CCCL_HOST_DEVICE + pair::type, typename _CUDA_VSTD::__unwrap_ref_decay::type> + make_pair(T1&& t1, T2&& t2) +{ + return pair::type, typename _CUDA_VSTD::__unwrap_ref_decay::type>( + _CUDA_VSTD::forward(t1), _CUDA_VSTD::forward(t2)); +} + +using _CUDA_VSTD::get; /*! 
\endcond */ @@ -88,3 +127,38 @@ using ::cuda::std::make_pair; */ THRUST_NAMESPACE_END + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +struct tuple_size> : tuple_size> +{}; + +template +struct tuple_element> : tuple_element> +{}; + +template +struct __tuple_like_ext> : true_type +{}; + +_LIBCUDACXX_END_NAMESPACE_STD + +// This is a workaround for the fact that structured bindings require that the specializations of +// `tuple_size` and `tuple_element` reside in namespace std (https://eel.is/c++draft/dcl.struct.bind#4). +// See https://github.com/NVIDIA/libcudacxx/issues/316 for a short discussion +#if _CCCL_STD_VER >= 2017 + +# include + +namespace std +{ +template +struct tuple_size> : tuple_size> +{}; + +template +struct tuple_element> : tuple_element> +{}; +} // namespace std +#endif // _CCCL_STD_VER >= 2017 diff --git a/thrust/thrust/tuple.h b/thrust/thrust/tuple.h index 5fa1d7f2f7c..4313df83fbd 100644 --- a/thrust/thrust/tuple.h +++ b/thrust/thrust/tuple.h @@ -94,28 +94,32 @@ _CCCL_HOST_DEVICE inline bool operator>(const null_type&, const null_type&) * \tparam N This parameter selects the element of interest. * \tparam T A \c tuple type of interest. * - * \see pair + * \see tuple * \see tuple */ template -using tuple_element = ::cuda::std::tuple_element; +using tuple_element = _CUDA_VSTD::tuple_element; /*! This metafunction returns the number of elements * of a \p tuple type of interest. * * \tparam T A \c tuple type of interest. * - * \see pair + * \see tuple * \see tuple */ template -using tuple_size = ::cuda::std::tuple_size; +using tuple_size = _CUDA_VSTD::tuple_size; + +template +struct __is_tuple_of_iterator_references : _CUDA_VSTD::false_type +{}; /*! \brief \p tuple is a class template that can be instantiated with up to ten * arguments. Each template argument specifies the type of element in the \p * tuple. Consequently, tuples are heterogeneous, fixed-size collections of * values. An instantiation of \p tuple with two arguments is similar to an - * instantiation of \p pair with the same two arguments. Individual elements + * instantiation of \p tuple with the same two arguments. Individual elements * of a \p tuple may be accessed with the \p get function. * * \tparam TN The type of the N \c tuple element. Thrust's \p tuple @@ -143,19 +147,72 @@ using tuple_size = ::cuda::std::tuple_size; * } * \endcode * - * \see pair + * \see tuple * \see get * \see make_tuple * \see tuple_element * \see tuple_size * \see tie */ -template -using tuple = ::cuda::std::tuple; +template +struct tuple : public _CUDA_VSTD::tuple +{ + using super_t = _CUDA_VSTD::tuple; + using super_t::super_t; + + tuple() = default; + + template ::value, int> = 0, + _CUDA_VSTD::__enable_if_t<(tuple_size<_TupleOfIteratorReferences>::value == sizeof...(Ts)), int> = 0> + _CCCL_HOST_DEVICE tuple(_TupleOfIteratorReferences&& tup) + : tuple(_CUDA_VSTD::forward<_TupleOfIteratorReferences>(tup).template __to_tuple( + _CUDA_VSTD::__make_tuple_indices_t())) + {} + + _CCCL_EXEC_CHECK_DISABLE + template ::value, int> = 0> + _CCCL_HOST_DEVICE tuple& operator=(TupleLike&& other) + { + super_t::operator=(_CUDA_VSTD::forward(other)); + return *this; + } +}; + +#if _CCCL_STD_VER >= 2017 +template +_CCCL_HOST_DEVICE tuple(Ts...) 
-> tuple; + +template +struct pair; + +template +_CCCL_HOST_DEVICE tuple(pair) -> tuple; +#endif // _CCCL_STD_VER >= 2017 + +template +inline _CCCL_HOST_DEVICE + _CUDA_VSTD::__enable_if_t<_CUDA_VSTD::__all<_CUDA_VSTD::__is_swappable::value...>::value, void> + swap(tuple& __x, + tuple& __y) noexcept((_CUDA_VSTD::__all<_CUDA_VSTD::__is_nothrow_swappable::value...>::value)) +{ + __x.swap(__y); +} + +template +inline _CCCL_HOST_DEVICE tuple::type...> make_tuple(Ts&&... __t) +{ + return tuple::type...>(_CUDA_VSTD::forward(__t)...); +} -using ::cuda::std::get; -using ::cuda::std::make_tuple; -using ::cuda::std::tie; +template +inline _CCCL_HOST_DEVICE tuple tie(Ts&... ts) noexcept +{ + return tuple(ts...); +} + +using _CUDA_VSTD::get; /*! \endcond */ @@ -170,6 +227,18 @@ THRUST_NAMESPACE_END _LIBCUDACXX_BEGIN_NAMESPACE_STD +template +struct tuple_size> : tuple_size> +{}; + +template +struct tuple_element> : tuple_element> +{}; + +template +struct __tuple_like_ext> : true_type +{}; + template <> struct tuple_size= 2017 +namespace std +{ +template +struct tuple_size> : tuple_size> +{}; + +template +struct tuple_element> : tuple_element> +{}; +} // namespace std +#endif // _CCCL_STD_VER >= 2017 From fe3a98bf844600cb078343ab3b010379f6f2ca0a Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 2 May 2024 19:03:00 +0200 Subject: [PATCH 7/9] Add workaround for MSVC2017 --- thrust/thrust/tuple.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/thrust/thrust/tuple.h b/thrust/thrust/tuple.h index 4313df83fbd..f8a77850174 100644 --- a/thrust/thrust/tuple.h +++ b/thrust/thrust/tuple.h @@ -40,6 +40,7 @@ #endif // no system header #include +#include #include #include @@ -178,6 +179,23 @@ struct tuple : public _CUDA_VSTD::tuple super_t::operator=(_CUDA_VSTD::forward(other)); return *this; } + +#if defined(_CCCL_COMPILER_MSVC_2017) + // MSVC2017 needs some help to convert tuples + template , tuple>::value, int> = 0, + _CUDA_VSTD::__enable_if_t<_CUDA_VSTD::__tuple_convertible<_CUDA_VSTD::tuple, super_t>::value, int> = 0> + _CCCL_HOST_DEVICE constexpr operator tuple() + { + return __to_tuple(typename _CUDA_VSTD::__make_tuple_indices::type{}); + } + + template + _CCCL_HOST_DEVICE constexpr tuple __to_tuple(_CUDA_VSTD::__tuple_indices) const + { + return tuple{_CUDA_VSTD::get(*this)...}; + } +#endif // _CCCL_COMPILER_MSVC_2017 }; #if _CCCL_STD_VER >= 2017 From 2eea114e37763af5718129de9d6961e8f70ab269 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 3 May 2024 16:39:30 +0200 Subject: [PATCH 8/9] Fix docs --- thrust/thrust/tuple.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/thrust/thrust/tuple.h b/thrust/thrust/tuple.h index f8a77850174..9a69a3b1c2b 100644 --- a/thrust/thrust/tuple.h +++ b/thrust/thrust/tuple.h @@ -95,7 +95,7 @@ _CCCL_HOST_DEVICE inline bool operator>(const null_type&, const null_type&) * \tparam N This parameter selects the element of interest. * \tparam T A \c tuple type of interest. * - * \see tuple + * \see pair * \see tuple */ template @@ -120,7 +120,7 @@ struct __is_tuple_of_iterator_references : _CUDA_VSTD::false_type * arguments. Each template argument specifies the type of element in the \p * tuple. Consequently, tuples are heterogeneous, fixed-size collections of * values. An instantiation of \p tuple with two arguments is similar to an - * instantiation of \p tuple with the same two arguments. Individual elements + * instantiation of \p pair with the same two arguments. 
Individual elements
  * of a \p tuple may be accessed with the \p get function.
  *
  * \tparam TN The type of the N \c tuple element. Thrust's \p tuple
@@ -148,7 +148,7 @@ struct __is_tuple_of_iterator_references : _CUDA_VSTD::false_type
  * }
  * \endcode
  *
- * \see tuple
+ * \see pair
  * \see get
  * \see make_tuple
  * \see tuple_element

From a85236d934bf7fd4ea4041a0ddc81c5d053ac28c Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Mon, 6 May 2024 08:44:13 +0200
Subject: [PATCH 9/9] Fix comment

---
 thrust/thrust/tuple.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thrust/thrust/tuple.h b/thrust/thrust/tuple.h
index 9a69a3b1c2b..701da426b3d 100644
--- a/thrust/thrust/tuple.h
+++ b/thrust/thrust/tuple.h
@@ -106,7 +106,7 @@ using tuple_element = _CUDA_VSTD::tuple_element;
  *
  * \tparam T A \c tuple type of interest.
  *
- * \see tuple
+ * \see pair
  * \see tuple
  */
 template
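
Usage sketch for the vsmem helper relocated by PATCH 1/9 into cub/cub/util_vsmem.cuh
(illustrative only, not part of the patch; `AgentT`, `example_kernel`, and the agent's
constructor arguments are hypothetical placeholders — only the `vsmem_t` /
`vsmem_helper_impl` API shown here comes from the moved code):

    #include <cub/util_type.cuh>   // cub::NullType
    #include <cub/util_vsmem.cuh>  // cub::detail::vsmem_t, cub::detail::vsmem_helper_impl

    template <typename AgentT>
    __global__ void example_kernel(cub::detail::vsmem_t vsmem)
    {
      using vsmem_helper_t = cub::detail::vsmem_helper_impl<AgentT>;

      // Either AgentT::TempStorage (fits into the 48 KB static shared memory budget)
      // or cub::NullType (temporary storage must live in global-memory-backed vsmem)
      __shared__ typename vsmem_helper_t::static_temp_storage_t static_temp_storage;

      // Returns the shared-memory storage, or this block's slice of the vsmem allocation
      auto& temp_storage = vsmem_helper_t::get_temp_storage(static_temp_storage, vsmem);

      AgentT agent{temp_storage /*, ...further constructor arguments... */};
      // ...agent does its work using temp_storage...

      // Hints to discard the modified cache lines when vsmem was used; no-op otherwise
      vsmem_helper_t::discard_temp_storage(temp_storage);
    }

    // Host side (sketch): the dispatch layer allocates
    // grid_size * vsmem_helper_t::vsmem_per_block bytes of global memory
    // (zero bytes if the agent fits into shared memory) and passes it as vsmem_t{ptr}.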
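
Usage sketch for the CTAD support added to `thrust::pair` and `thrust::tuple` by
PATCHES 6-9 (illustrative only, mirroring the new TestPairCTAD / TestTupleCTAD unit
tests; requires C++17):

    #include <thrust/pair.h>
    #include <thrust/tuple.h>

    int main()
    {
      // Class template argument deduction now works for thrust::pair and thrust::tuple
      thrust::pair  p(42, 1337);      // deduced as thrust::pair<int, int>
      thrust::tuple t(0, 'a', 3.14);  // deduced as thrust::tuple<int, char, double>

      // Structured bindings keep working on the derived types
      auto [a, b] = p;

      return (a == 42 && b == 1337 && thrust::get<1>(t) == 'a') ? 0 : 1;
    }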