diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp
new file mode 100644
index 0000000000..4316b3604f
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp
@@ -0,0 +1,73 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "concurrent_agents.h"
+#include "cuda_space_selector.h"
+#include "test_macros.h"
+
+template <typename... _Ty>
+__device__ inline bool __unused(_Ty...) { return true; }
+
+__global__ void test_compilation() {
+  using cuda::ptx::sem_release;
+  using cuda::ptx::space_cluster;
+  using cuda::ptx::space_shared;
+  using cuda::ptx::scope_cluster;
+  using cuda::ptx::scope_cta;
+
+  __shared__ uint64_t bar;
+  bar = 1;
+  uint64_t state = 1;
+
+#if __cccl_ptx_isa >= 700
+  NV_IF_TARGET(NV_PROVIDES_SM_80, (
+    state = cuda::ptx::mbarrier_arrive(&bar);                 // 1.
+    state = cuda::ptx::mbarrier_arrive_no_complete(&bar, 1);  // 5.
+  ));
+#endif // __cccl_ptx_isa >= 700
+
+  // This guard is redundant: before PTX ISA 7.8, there was no support for SM_90
+#if __cccl_ptx_isa >= 780
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    state = cuda::ptx::mbarrier_arrive(&bar, 1);              // 2.
+  ));
+#endif // __cccl_ptx_isa >= 780
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar);                  // 3a.
+    state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar);              // 3a.
+
+    state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1);               // 3b.
+    state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1);           // 3b.
+
+    cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar);                     // 4a.
+
+    cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar, 1);                  // 4b.
+
+    state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1);     // 8.
+    state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 8.
+
+    cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, &bar, 1);        // 9.
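+    // The numbered comments above (1., 3a., 8., ...) match the overload
+    // variant numbers documented for mbarrier.arrive in docs/extended_api/ptx.md.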
+  ));
+#endif // __cccl_ptx_isa >= 800
+  __unused(bar, state);
+}
+
+int main(int, char**)
+{
+    return 0;
+}
diff --git a/libcudacxx/.upstream-tests/test/support/concurrent_agents.h b/libcudacxx/.upstream-tests/test/support/concurrent_agents.h
index d0d3163c88..33b338ff71 100644
--- a/libcudacxx/.upstream-tests/test/support/concurrent_agents.h
+++ b/libcudacxx/.upstream-tests/test/support/concurrent_agents.h
@@ -19,6 +19,8 @@
 #endif
 #endif
 
+#include 
+
 #include "test_macros.h"
 
 TEST_EXEC_CHECK_DISABLE
diff --git a/libcudacxx/docs/extended_api.md b/libcudacxx/docs/extended_api.md
index 952b7c81e5..6f71683edc 100644
--- a/libcudacxx/docs/extended_api.md
+++ b/libcudacxx/docs/extended_api.md
@@ -21,6 +21,8 @@ nav_order: 3
 
 {% include_relative extended_api/functional.md %}
 
+{% include_relative extended_api/ptx.md %}
+
 [Thread Scopes]: ./extended_api/memory_model.md#thread-scopes
 [Thread Groups]: ./extended_api/thread_groups.md
diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md
new file mode 100644
index 0000000000..e45eed54a4
--- /dev/null
+++ b/libcudacxx/docs/extended_api/ptx.md
@@ -0,0 +1,669 @@
+## PTX instructions
+
+The `cuda::ptx` namespace contains functions that map one-to-one to
+[PTX instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html).
+These can be used for maximal control of the generated code, or to experiment
+with new hardware features before a high-level C++ API is available.
+
+### Variants
+
+Many instructions come in several variants, distinguished by their `.sem`,
+`.scope`, and `.space` qualifiers. These qualifiers are exposed as strongly
+typed tag arguments such as `cuda::ptx::sem_release` and
+`cuda::ptx::space_shared`; the supported combinations are listed with each
+binding below.
+
+### Versions and compatibility
+
+The `cuda/ptx` header is intended to present a stable API (not ABI) within one
+major version of the CTK on a best-effort basis. This means that:
+
+- All functions are marked static inline.
+
+- The type of a function parameter can be changed to be more generic if
+  that means that code that called the original version can still be
+  compiled.
+
+- Good exposure of the PTX should be high priority. If, at a new major
+  version, we face a difficult choice between breaking backward-compatibility
+  and improving the PTX exposure, we will lean toward the latter option more
+  readily than in other parts of libcu++.
+
+API stability is not taken to the extreme. Call functions as shown below to
+ensure forward compatibility:
+
+```cuda
+// Use arguments to drive overload resolution:
+cuda::ptx::mbarrier_arrive_expect_tx(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1);
+
+// Specifying templates directly is not forward-compatible, as the order and
+// number of template parameters may change in a minor release:
+cuda::ptx::mbarrier_arrive_expect_tx<cuda::ptx::dot_scope::cta>(
+  cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1
+);
+```
+
+**PTX ISA version and compute capability.** Each binding notes under which PTX
+ISA version and SM version it may be used. Example:
+
+```cuda
+// mbarrier.arrive.shared::cta.b64 state, [addr]; // 1. PTX ISA 70, SM_80
+__device__ inline uint64_t mbarrier_arrive(
+  uint64_t* addr);
+```
+
+To check if the current compiler is recent enough, use:
+```cuda
+#if __cccl_ptx_isa >= 700
+cuda::ptx::mbarrier_arrive(&bar);
+#endif
+```
+
+Ensure that you only call the function when compiling for a recent enough
+compute capability (SM version), like this:
+```cuda
+NV_IF_TARGET(NV_PROVIDES_SM_80,(
+  cuda::ptx::mbarrier_arrive(&bar);
+));
+```
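+In practice the two guards are combined: the `__cccl_ptx_isa` check keeps
+unsupported PTX away from the assembler, while the `NV_IF_TARGET` check keeps
+the instruction off older target architectures. A minimal sketch, mirroring
+the compile test at the top of this patch and assuming the shared-memory
+barrier `bar` from the examples above:
+
+```cuda
+#if __cccl_ptx_isa >= 700
+NV_IF_TARGET(NV_PROVIDES_SM_80,(
+  // Compiled only when both the compiler (PTX ISA >= 7.0) and the
+  // target architecture (SM_80 or newer) support mbarrier.arrive.
+  cuda::ptx::mbarrier_arrive(&bar);
+));
+#endif // __cccl_ptx_isa >= 700
+```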
+
+For more information on which compilers correspond to which PTX ISA, see the
+[PTX ISA release
+notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes).
+
+
+### [9.7.1. Integer Arithmetic Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions)
+
+| Instruction                              | Available in libcu++ |
+|------------------------------------------|----------------------|
+| [`sad`]                                  | No                   |
+| [`div`]                                  | No                   |
+| [`rem`]                                  | No                   |
+| [`abs`]                                  | No                   |
+| [`neg`]                                  | No                   |
+| [`min`]                                  | No                   |
+| [`max`]                                  | No                   |
+| [`popc`]                                 | No                   |
+| [`clz`]                                  | No                   |
+| [`bfind`]                                | No                   |
+| [`fns`]                                  | No                   |
+| [`brev`]                                 | No                   |
+| [`bfe`]                                  | No                   |
+| [`bfi`]                                  | No                   |
+| [`szext`]                                | No                   |
+| [`bmsk`]                                 | No                   |
+| [`dp4a`]                                 | No                   |
+| [`dp2a`]                                 | No                   |
+
+[`sad`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad
+[`div`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div
+[`rem`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem
+[`abs`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs
+[`neg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg
+[`min`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-min
+[`max`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-max
+[`popc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc
+[`clz`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz
+[`bfind`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind
+[`fns`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns
+[`brev`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev
+[`bfe`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe
+[`bfi`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi
+[`szext`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-szext
+[`bmsk`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk
+[`dp4a`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a
+[`dp2a`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a
+
+### [9.7.2. 
Extended-Precision Integer Arithmetic Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-integer-arithmetic-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`add.cc`] | No | +| [`addc`] | No | +| [`sub.cc`] | No | +| [`subc`] | No | +| [`mad.cc`] | No | +| [`madc`] | No | + +[`add.cc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-add-cc +[`addc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc +[`sub.cc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-sub-cc +[`subc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-subc +[`mad.cc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc +[`madc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc + +### [9.7.3. Floating-Point Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`testp`] | No | +| [`copysign`] | No | +| [`add`] | No | +| [`sub`] | No | +| [`mul`] | No | +| [`fma`] | No | +| [`mad`] | No | +| [`div`] | No | +| [`abs`] | No | +| [`neg`] | No | +| [`min`] | No | +| [`max`] | No | +| [`rcp`] | No | +| [`rcp.approx.ftz.f64`] | No | +| [`sqrt`] | No | +| [`rsqrt`] | No | +| [`rsqrt.approx.ftz.f64`] | No | +| [`sin`] | No | +| [`cos`] | No | +| [`lg2`] | No | +| [`ex2`] | No | +| [`tanh`] | No | + +[`testp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-testp +[`copysign`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign +[`add`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add +[`sub`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub +[`mul`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul +[`fma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma +[`mad`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad +[`div`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div +[`abs`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs +[`neg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg +[`min`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min +[`max`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max +[`rcp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp +[`rcp.approx.ftz.f64`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp-approx-ftz-f64 +[`sqrt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt +[`rsqrt`]: 
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt +[`rsqrt.approx.ftz.f64`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64 +[`sin`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin +[`cos`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos +[`lg2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2 +[`ex2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2 +[`tanh`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-tanh + +### [9.7.4. Half Precision Floating-Point Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`add`] | No | +| [`sub`] | No | +| [`mul`] | No | +| [`fma`] | No | +| [`neg`] | No | +| [`abs`] | No | +| [`min`] | No | +| [`max`] | No | +| [`tanh`] | No | +| [`ex2`] | No | + +[`add`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add +[`sub`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-sub +[`mul`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul +[`fma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma +[`neg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg +[`abs`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs +[`min`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min +[`max`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-max +[`tanh`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-tanh +[`ex2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-ex2 + +### [9.7.5. Comparison and Selection Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`set`] | No | +| [`setp`] | No | +| [`selp`] | No | +| [`slct`] | No | + +[`set`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-set +[`setp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp +[`selp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp +[`slct`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-slct + +### [9.7.6. 
Half Precision Comparison Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`set`] | No | +| [`setp`] | No | + +[`set`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-set +[`setp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-setp + +### [9.7.7. Logic and Shift Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`and`] | No | +| [`or`] | No | +| [`xor`] | No | +| [`not`] | No | +| [`cnot`] | No | +| [`lop3`] | No | +| [`shf`] | No | +| [`shl`] | No | +| [`shr`] | No | + +[`and`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and +[`or`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or +[`xor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor +[`not`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not +[`cnot`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot +[`lop3`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3 +[`shf`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shf +[`shl`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl +[`shr`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr + +### [9.7.8. 
Data Movement and Conversion Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`mov`] | No | +| [`mov`] | No | +| [`shfl (deprecated)`] | No | +| [`shfl.sync`] | No | +| [`prmt`] | No | +| [`ld`] | No | +| [`ld.global.nc`] | No | +| [`ldu`] | No | +| [`st`] | No | +| [`st.async`] | No | +| [`multimem.ld_reduce, multimem.st, multimem.red`] | No | +| [`prefetch, prefetchu`] | No | +| [`applypriority`] | No | +| [`discard`] | No | +| [`createpolicy`] | No | +| [`isspacep`] | No | +| [`cvta`] | No | +| [`cvt`] | No | +| [`cvt.pack`] | No | +| [`mapa`] | No | +| [`getctarank`] | No | + +[`mov`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2 +[`shfl (deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated +[`shfl.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync +[`prmt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt +[`ld`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld +[`ld.global.nc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc +[`ldu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu +[`st`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st +[`st.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async +[`multimem.ld_reduce, multimem.st, multimem.red`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red +[`prefetch, prefetchu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu +[`applypriority`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority +[`discard`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard +[`createpolicy`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy +[`isspacep`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep +[`cvta`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta +[`cvt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt +[`cvt.pack`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack +[`mapa`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa +[`getctarank`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank + +### [9.7.8.24. 
Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`cp.async`] | No | +| [`cp.async.commit_group`] | No | +| [`cp.async.wait_group / cp.async.wait_all`] | No | +| [`cp.async.bulk`] | No | +| [`cp.reduce.async.bulk`] | No | +| [`cp.async.bulk.prefetch`] | No | +| [`cp.async.bulk.tensor`] | No | +| [`cp.reduce.async.bulk.tensor`] | No | +| [`cp.async.bulk.prefetch.tensor`] | No | +| [`cp.async.bulk.commit_group`] | No | +| [`cp.async.bulk.wait_group`] | No | +| [`tensormap.replace`] | No | + +[`cp.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async +[`cp.async.commit_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group +[`cp.async.wait_group / cp.async.wait_all`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all +[`cp.async.bulk`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk +[`cp.reduce.async.bulk`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk +[`cp.async.bulk.prefetch`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch +[`cp.async.bulk.tensor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor +[`cp.reduce.async.bulk.tensor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor +[`cp.async.bulk.prefetch.tensor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor +[`cp.async.bulk.commit_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group +[`cp.async.bulk.wait_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group +[`tensormap.replace`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace + +### [9.7.9. Texture Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`tex`] | No | +| [`tld4`] | No | +| [`txq`] | No | +| [`istypep`] | No | + +[`tex`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex +[`tld4`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tld4 +[`txq`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-txq +[`istypep`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep + +### [9.7.10. 
Surface Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`suld`] | No | +| [`sust`] | No | +| [`sured`] | No | +| [`suq`] | No | + +[`suld`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld +[`sust`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust +[`sured`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sured +[`suq`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suq + +### [9.7.11. Control Flow Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`{}`] | No | +| [`@`] | No | +| [`bra`] | No | +| [`brx.idx`] | No | +| [`call`] | No | +| [`ret`] | No | +| [`exit`] | No | + +[`{}`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-curly-braces +[`@`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-at +[`bra`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra +[`brx.idx`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx +[`call`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call +[`ret`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret +[`exit`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit + +### [9.7.12. 
Parallel Synchronization and Communication Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`bar, barrier`] | No | +| [`bar.warp.sync`] | No | +| [`barrier.cluster`] | No | +| [`membar/fence`] | No | +| [`atom`] | No | +| [`red`] | No | +| [`red.async`] | No | +| [`vote (deprecated)`] | No | +| [`vote.sync`] | No | +| [`match.sync`] | No | +| [`activemask`] | No | +| [`redux.sync`] | No | +| [`griddepcontrol`] | No | +| [`elect.sync`] | No | + +[`bar, barrier`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier +[`bar.warp.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync +[`barrier.cluster`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster +[`membar/fence`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence +[`atom`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom +[`red`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red +[`red.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async +[`vote (deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated +[`vote.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync +[`match.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync +[`activemask`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask +[`redux.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync +[`griddepcontrol`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol +[`elect.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync + +### [9.7.12.15. 
Parallel Synchronization and Communication Instructions: mbarrier](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier)
+
+| Instruction                              | Available in libcu++    |
+|------------------------------------------|-------------------------|
+| [`mbarrier.init`]                        | No                      |
+| [`mbarrier.inval`]                       | No                      |
+| [`mbarrier.expect_tx`]                   | No                      |
+| [`mbarrier.complete_tx`]                 | No                      |
+| [`mbarrier.arrive`]                      | CTK-FUTURE, CCCL v2.3.0 |
+| [`mbarrier.arrive_drop`]                 | No                      |
+| [`cp.async.mbarrier.arrive`]             | No                      |
+| [`mbarrier.test_wait/mbarrier.try_wait`] | No                      |
+| [`mbarrier.pending_count`]               | No                      |
+| [`tensormap.cp_fenceproxy`]              | No                      |
+
+[`mbarrier.init`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init
+[`mbarrier.inval`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval
+[`mbarrier.expect_tx`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx
+[`mbarrier.complete_tx`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx
+[`mbarrier.arrive`]: #mbarrierarrive
+[`mbarrier.arrive_drop`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop
+[`cp.async.mbarrier.arrive`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive
+[`mbarrier.test_wait/mbarrier.try_wait`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait
+[`mbarrier.pending_count`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-pending-count
+[`tensormap.cp_fenceproxy`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy
+
+
+#### `mbarrier.arrive`
+
+- PTX ISA: [mbarrier.arrive](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
+
+```cuda
+// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80
+template <typename=void>
+__device__ static inline uint64_t mbarrier_arrive(
+  uint64_t* addr);
+
+// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90
+template <typename=void>
+__device__ static inline uint64_t mbarrier_arrive(
+  uint64_t* addr,
+  const uint32_t& count);
+
+// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90
+// .sem = { .release }
+// .scope = { .cta, .cluster }
+// .space = { .shared::cta }
+template <cuda::ptx::dot_scope _Sco>
+__device__ static inline uint64_t mbarrier_arrive(
+  cuda::ptx::sem_release_t,
+  cuda::ptx::scope_t<_Sco> scope,
+  cuda::ptx::space_shared_t,
+  uint64_t* addr);
+
+// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90
+// .sem = { .release }
+// .scope = { .cta, .cluster }
+// .space = { .shared::cta }
+template <cuda::ptx::dot_scope _Sco>
+__device__ static inline uint64_t mbarrier_arrive(
+  cuda::ptx::sem_release_t,
+  cuda::ptx::scope_t<_Sco> scope,
+  cuda::ptx::space_shared_t,
+  uint64_t* addr,
+  const uint32_t& count);
+
+// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90
+// .sem = { .release }
+// .scope = { .cluster }
+// .space = { .shared::cluster }
+template <typename=void>
+__device__ static inline void mbarrier_arrive(
+  cuda::ptx::sem_release_t,
+  cuda::ptx::scope_cluster_t,
+  cuda::ptx::space_cluster_t,
+  uint64_t* addr);
+
+// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90
+// .sem = { .release }
+// .scope = { .cluster }
+// .space = { .shared::cluster }
+template <typename=void>
+__device__ static inline void mbarrier_arrive(
+  cuda::ptx::sem_release_t,
+  cuda::ptx::scope_cluster_t,
+  cuda::ptx::space_cluster_t,
+  uint64_t* addr,
+  const uint32_t& count);
+```
+
+```cuda
+// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80
+template <typename=void>
+__device__ static inline uint64_t mbarrier_arrive_no_complete(
+  uint64_t* addr,
+  const uint32_t& count);
+```
+
+```cuda
+// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90
+// .sem = { .release }
+// .scope = { .cta, .cluster }
+// .space = { .shared::cta }
+template <cuda::ptx::dot_scope _Sco>
+__device__ static inline uint64_t mbarrier_arrive_expect_tx(
+  cuda::ptx::sem_release_t,
+  cuda::ptx::scope_t<_Sco> scope,
+  cuda::ptx::space_shared_t,
+  uint64_t* addr,
+  const uint32_t& tx_count);
+
+// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90
+// .sem = { .release }
+// .scope = { .cluster }
+// .space = { .shared::cluster }
+template <typename=void>
+__device__ static inline void mbarrier_arrive_expect_tx(
+  cuda::ptx::sem_release_t,
+  cuda::ptx::scope_cluster_t,
+  cuda::ptx::space_cluster_t,
+  uint64_t* addr,
+  const uint32_t& tx_count);
+```
+
+Usage:
+```cuda
+#include <cuda/ptx>
+#include <cuda/barrier>
+#include <cooperative_groups.h>
+
+__global__ void kernel() {
+  using cuda::ptx::sem_release;
+  using cuda::ptx::space_cluster;
+  using cuda::ptx::space_shared;
+  using cuda::ptx::scope_cluster;
+  using cuda::ptx::scope_cta;
+
+  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
+  __shared__ barrier_t bar;
+  init(&bar, blockDim.x);
+  __syncthreads();
+
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // Arrive on local shared memory barrier:
+    uint64_t token;
+    token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, cuda::device::barrier_native_handle(bar), 1);
+    token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, cuda::device::barrier_native_handle(bar), 1);
+
+    // Get address of remote cluster barrier:
+    namespace cg = cooperative_groups;
+    cg::cluster_group cluster = cg::this_cluster();
+    unsigned int other_block_rank = cluster.block_rank() ^ 1;
+    uint64_t * remote_bar = cluster.map_shared_rank(cuda::device::barrier_native_handle(bar), other_block_rank);
+
+    // Sync cluster to ensure remote barrier is initialized.
+    cluster.sync();
+
+    // Arrive on remote cluster barrier (space_cluster requires scope_cluster):
+    cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, remote_bar, 1);
+  ));
+}
+```
+### [9.7.13. 
Warp Level Matrix Multiply-Accumulate Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`wmma.load`] | No | +| [`wmma.store`] | No | +| [`wmma.mma`] | No | +| [`mma`] | No | +| [`ldmatrix`] | No | +| [`stmatrix`] | No | +| [`movmatrix`] | No | +| [`mma.sp`] | No | + +[`wmma.load`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-wmma-load +[`wmma.store`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-wmma-store +[`wmma.mma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-and-accumulate-instruction-wmma-mma +[`mma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma +[`ldmatrix`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-ldmatrix +[`stmatrix`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix +[`movmatrix`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-transpose-instruction-movmatrix +[`mma.sp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma-sp + +### [9.7.14. Asynchronous Warpgroup Level Matrix Multiply-Accumulate Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-multiply-accumulate-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`wgmma.mma_async`] | No | +| [`wgmma.mma_async.sp`] | No | +| [`wgmma.fence`] | No | +| [`wgmma.commit_group`] | No | +| [`wgmma.wait_group`] | No | + +[`wgmma.mma_async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async +[`wgmma.mma_async.sp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async-sp +[`wgmma.fence`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-fence +[`wgmma.commit_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-commit-group +[`wgmma.wait_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-wait-group + +### [9.7.15. Stack Manipulation Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`stacksave`] | No | +| [`stackrestore`] | No | +| [`alloca`] | No | + +[`stacksave`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stacksave +[`stackrestore`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stackrestore +[`alloca`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca + +### [9.7.16. 
Video Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#video-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`vadd, vsub, vabsdiff, vmin, vmax`] | No | +| [`vshl, vshr`] | No | +| [`vmad`] | No | +| [`vset`] | No | + +[`vadd, vsub, vabsdiff, vmin, vmax`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax +[`vshl, vshr`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr +[`vmad`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vmad +[`vset`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vset + +### [9.7.16.2. SIMD Video Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2`] | No | +| [`vset2`] | No | +| [`vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4`] | No | +| [`vset4`] | No | + +[`vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2 +[`vset2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset2 +[`vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4 +[`vset4`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset4 + +### [9.7.17. Miscellaneous Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`brkpt`] | No | +| [`nanosleep`] | No | +| [`pmevent`] | No | +| [`trap`] | No | +| [`setmaxnreg`] | No | + +[`brkpt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt +[`nanosleep`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep +[`pmevent`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent +[`trap`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap +[`setmaxnreg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx new file mode 100644 index 0000000000..ab6ed62d9d --- /dev/null +++ b/libcudacxx/include/cuda/ptx @@ -0,0 +1,23 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX +#define _CUDA_PTX + +#include "std/detail/__config" + +#include "std/detail/__pragma_push" + +#include "std/detail/libcxx/include/__cuda/ptx.h" + +#include "std/detail/__pragma_pop" + +#endif // _CUDA_PTX diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config index f71aa62324..bc0398eddf 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config @@ -1505,6 +1505,9 @@ typedef __char32_t char32_t; #define _LIBCUDACXX_END_NAMESPACE_CUDA } } #define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE namespace cuda { namespace device { inline namespace _LIBCUDACXX_ABI_NAMESPACE { #define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE } } } +#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX namespace cuda { namespace ptx { inline namespace _LIBCUDACXX_ABI_NAMESPACE { +#define _LIBCUDACXX_END_NAMESPACE_CUDA_PTX } } } +#define _CUDA_VPTX ::cuda::ptx::_LIBCUDACXX_ABI_NAMESPACE #define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL namespace cuda { namespace device { namespace experimental { inline namespace _LIBCUDACXX_ABI_NAMESPACE { #define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL } } } } #endif diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index 115e57a27d..c2ed62c92f 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -27,6 +27,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER #include "../cstdlib" // _LIBCUDACXX_UNREACHABLE #include "../__type_traits/void_t.h" // _CUDA_VSTD::__void_t +#include "../__cuda/ptx.h" // cuda::ptx::* #if defined(_LIBCUDACXX_COMPILER_NVRTC) #define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member)) @@ -206,29 +207,21 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity; else if (!__isShared(&__barrier)) { __trap(); } - - asm volatile ("mbarrier.arrive.shared.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__update)) - : "memory"); + // Cannot use cuda::device::barrier_native_handle here, as it is + // only defined for block-scope barriers. This barrier may be a + // non-block scoped barrier. 
+ auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); + __token = _CUDA_VPTX::mbarrier_arrive(__bh, __update); ), NV_PROVIDES_SM_80, ( if (!__isShared(&__barrier)) { return __barrier.arrive(__update); } - + auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); // Need 2 instructions, can't finish barrier with arrive > 1 if (__update > 1) { - asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__update - 1)) - : "memory"); + _CUDA_VPTX::mbarrier_arrive_no_complete(__bh, __update - 1); } - asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))) - : "memory"); + __token = _CUDA_VPTX::mbarrier_arrive( __bh); ), NV_IS_DEVICE, ( if (!__isShared(&__barrier)) { return __barrier.arrive(__update); @@ -603,14 +596,12 @@ barrier::arrival_token barrier_arrive_tx( // us in release builds. In debug builds, the error would be caught // by the asserts at the top of this function. - auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); + auto __native_handle = barrier_native_handle(__b); + auto __bh = __cvta_generic_to_shared(__native_handle); if (__arrive_count_update == 1) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) - : "memory"); + __token = _CUDA_VPTX::mbarrier_arrive_expect_tx( + _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __transaction_count_update + ); } else { asm ( "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" @@ -618,12 +609,9 @@ barrier::arrival_token barrier_arrive_tx( : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) : "memory"); - asm ( - "mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__arrive_count_update)) - : "memory"); + __token = _CUDA_VPTX::mbarrier_arrive( + _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __arrive_count_update + ); } ) ); @@ -1089,8 +1077,8 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _ )); NV_IF_TARGET(NV_PROVIDES_SM_80, ( - const bool __can_use_async_group = __allowed_completions & uint32_t(__completion_mechanism::__async_group); if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (_Align >= 4) { + const bool __can_use_async_group = __allowed_completions & uint32_t(__completion_mechanism::__async_group); if (__can_use_async_group) { __cp_async_shared_global_mechanism<_Align>(__group, __dest_char, __src_char, __size); return __completion_mechanism::__async_group; diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h new file mode 100644 index 0000000000..384f3ba14b --- /dev/null +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -0,0 +1,719 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCUDACXX___CUDA_PTX_H
+#define _LIBCUDACXX___CUDA_PTX_H
+
+#ifndef __cuda_std__
+#error "<__cuda/ptx.h> should only be included from <cuda/ptx>"
+#endif // __cuda_std__
+
+#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
+#pragma GCC system_header
+#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
+_CCCL_IMPLICIT_SYSTEM_HEADER
+#endif // !_CCCL_COMPILER_NVHPC
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+#include "../__cuda/ptx/ptx_isa_target_macros.h"
+#include "../__cuda/ptx/ptx_dot_variants.h"
+#include "../__cuda/ptx/ptx_helper_functions.h"
+#include "../__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h"
+#include "../cstdint" // uint32_t
+
+/*
+ * The cuda::ptx namespace intends to provide PTX wrappers for new hardware
+ * features and new PTX instructions so that they can be experimented with
+ * before higher-level C++ APIs are designed and developed.
+ *
+ * The wrappers have the following responsibilities:
+ *
+ * - They must prevent any PTX assembler errors, that is:
+ *   - They are defined only for versions of the CUDA Toolkit in which nvcc/ptxas
+ *     actually recognizes the instruction.
+ *   - Sizes and types of parameters are correct.
+ * - They must convert state spaces correctly.
+ * - They adhere to the libcu++ coding standards of using:
+ *   - Reserved identifiers for all parameters and variables, e.g. `__meow` or `_Woof`.
+ *   - The _CUDA_VSTD:: namespace for types.
+ *
+ * The wrappers should not do the following:
+ *
+ * - Use any non-native types. For example, an mbarrier instruction wrapper
+ *   takes the barrier address as a uint64_t pointer.
+ *
+ * This header is intended for:
+ *
+ * - internal consumption by higher-level APIs such as cuda::barrier,
+ * - outside developers who want to experiment with the latest features of the
+ *   hardware.
+ *
+ * Stability:
+ *
+ * - These headers are intended to present a stable API (not ABI) within one
+ *   major version of the CTK. This means that:
+ *   - All functions are marked inline.
+ *   - The type of a function parameter can be changed to be more generic if
+ *     that means that code that called the original version can still be
+ *     compiled.
+ *
+ * - Good exposure of the PTX should be high priority. If, at a new major
+ *   version, we face a difficult choice between breaking backward-compatibility
+ *   and improving the PTX exposure, we will lean toward the latter option more
+ *   readily than in other parts of libcu++.
+ */
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+/*
+ * Instructions
+ *
+ * The organization of the instructions below follows that of the PTX ISA documentation:
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#instructions
+ *
+ * To improve code organization, some sections are separated into their own
+ * header. For instance, the mbarrier instructions are found in:
+ * __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
+ *
+ */
+
+/*
+ * 9.7.1. Integer Arithmetic Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions
+ *
+ */
+
+// 9.7.1.7. 
Integer Arithmetic Instructions: sad
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad
+
+// 9.7.1.8. Integer Arithmetic Instructions: div
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div
+
+// 9.7.1.9. Integer Arithmetic Instructions: rem
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem
+
+// 9.7.1.10. Integer Arithmetic Instructions: abs
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs
+
+// 9.7.1.11. Integer Arithmetic Instructions: neg
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg
+
+// 9.7.1.12. Integer Arithmetic Instructions: min
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-min
+
+// 9.7.1.13. Integer Arithmetic Instructions: max
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-max
+
+// 9.7.1.14. Integer Arithmetic Instructions: popc
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc
+
+// 9.7.1.15. Integer Arithmetic Instructions: clz
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz
+
+// 9.7.1.16. Integer Arithmetic Instructions: bfind
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind
+
+// 9.7.1.17. Integer Arithmetic Instructions: fns
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns
+
+// 9.7.1.18. Integer Arithmetic Instructions: brev
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev
+
+// 9.7.1.19. Integer Arithmetic Instructions: bfe
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe
+
+// 9.7.1.20. Integer Arithmetic Instructions: bfi
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi
+
+// 9.7.1.21. Integer Arithmetic Instructions: szext
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-szext
+
+// 9.7.1.22. Integer Arithmetic Instructions: bmsk
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk
+
+// 9.7.1.23. Integer Arithmetic Instructions: dp4a
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a
+
+// 9.7.1.24. Integer Arithmetic Instructions: dp2a
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a
+
+
+/*
+ * 9.7.2. Extended-Precision Integer Arithmetic Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-integer-arithmetic-instructions
+ *
+ */
+
+// 9.7.2.1. Extended-Precision Arithmetic Instructions: add.cc
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-add-cc
+
+// 9.7.2.2. Extended-Precision Arithmetic Instructions: addc
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc
+
+// 9.7.2.3. Extended-Precision Arithmetic Instructions: sub.cc
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-sub-cc
+
+// 9.7.2.4. Extended-Precision Arithmetic Instructions: subc
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-subc
+
+// 9.7.2.5. Extended-Precision Arithmetic Instructions: mad.cc
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc
+
+// 9.7.2.6. Extended-Precision Arithmetic Instructions: madc
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc
+
+
+/*
+ * 9.7.3. Floating-Point Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions
+ *
+ */
+
+// 9.7.3.1. Floating Point Instructions: testp
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-testp
+
+// 9.7.3.2. Floating Point Instructions: copysign
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign
+
+// 9.7.3.3. Floating Point Instructions: add
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add
+
+// 9.7.3.4. Floating Point Instructions: sub
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub
+
+// 9.7.3.5. Floating Point Instructions: mul
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul
+
+// 9.7.3.6. Floating Point Instructions: fma
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma
+
+// 9.7.3.7. Floating Point Instructions: mad
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad
+
+// 9.7.3.8. Floating Point Instructions: div
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div
+
+// 9.7.3.9. Floating Point Instructions: abs
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs
+
+// 9.7.3.10. Floating Point Instructions: neg
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg
+
+// 9.7.3.11. Floating Point Instructions: min
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min
+
+// 9.7.3.12. Floating Point Instructions: max
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max
+
+// 9.7.3.13. Floating Point Instructions: rcp
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp
+
+// 9.7.3.14. Floating Point Instructions: rcp.approx.ftz.f64
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp-approx-ftz-f64
+
+// 9.7.3.15. Floating Point Instructions: sqrt
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt
+
+// 9.7.3.16. Floating Point Instructions: rsqrt
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt
+
+// 9.7.3.17. Floating Point Instructions: rsqrt.approx.ftz.f64
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64
+
+// 9.7.3.18. Floating Point Instructions: sin
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin
+
+// 9.7.3.19. Floating Point Instructions: cos
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos
+
+// 9.7.3.20. Floating Point Instructions: lg2
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2
+
+// 9.7.3.21. Floating Point Instructions: ex2
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2
+
+// 9.7.3.22. Floating Point Instructions: tanh
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-tanh
+
+
+/*
+ * 9.7.4. Half Precision Floating-Point Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions
+ *
+ */
+
+// 9.7.4.1. Half Precision Floating Point Instructions: add
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add
+
+// 9.7.4.2. Half Precision Floating Point Instructions: sub
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-sub
+
+// 9.7.4.3. Half Precision Floating Point Instructions: mul
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul
+
+// 9.7.4.4. Half Precision Floating Point Instructions: fma
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma
+
+// 9.7.4.5. Half Precision Floating Point Instructions: neg
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg
+
+// 9.7.4.6. Half Precision Floating Point Instructions: abs
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs
+
+// 9.7.4.7. Half Precision Floating Point Instructions: min
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min
+
+// 9.7.4.8. Half Precision Floating Point Instructions: max
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-max
+
+// 9.7.4.9. Half Precision Floating Point Instructions: tanh
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-tanh
+
+// 9.7.4.10. Half Precision Floating Point Instructions: ex2
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-ex2
+
+
+/*
+ * 9.7.5. Comparison and Selection Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions
+ *
+ */
+
+// 9.7.5.1. Comparison and Selection Instructions: set
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-set
+
+// 9.7.5.2. Comparison and Selection Instructions: setp
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp
+
+// 9.7.5.3. Comparison and Selection Instructions: selp
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp
+
+// 9.7.5.4. Comparison and Selection Instructions: slct
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-slct
+
+
+/*
+ * 9.7.6. Half Precision Comparison Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions
+ *
+ */
+
+// 9.7.6.1. Half Precision Comparison Instructions: set
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-set
+
+// 9.7.6.2. Half Precision Comparison Instructions: setp
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-setp
+
+
+/*
+ * 9.7.7. Logic and Shift Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions
+ *
+ */
+
+// 9.7.7.1. Logic and Shift Instructions: and
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and
+
+// 9.7.7.2. Logic and Shift Instructions: or
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or
+
+// 9.7.7.3. Logic and Shift Instructions: xor
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor
+
+// 9.7.7.4. Logic and Shift Instructions: not
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not
+
+// 9.7.7.5. Logic and Shift Instructions: cnot
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot
+
+// 9.7.7.6. Logic and Shift Instructions: lop3
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3
+
+// 9.7.7.7. Logic and Shift Instructions: shf
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shf
+
+// 9.7.7.8. Logic and Shift Instructions: shl
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl
+
+// 9.7.7.9. Logic and Shift Instructions: shr
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr
+
+
+/*
+ * 9.7.8. Data Movement and Conversion Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions
+ *
+ */
+
+// 9.7.8.3. Data Movement and Conversion Instructions: mov
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov
+
+// 9.7.8.4. Data Movement and Conversion Instructions: mov
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2
+
+// 9.7.8.5. Data Movement and Conversion Instructions: shfl (deprecated)
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated
+
+// 9.7.8.6. Data Movement and Conversion Instructions: shfl.sync
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync
+
+// 9.7.8.7. Data Movement and Conversion Instructions: prmt
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
+
+// 9.7.8.8. Data Movement and Conversion Instructions: ld
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld
+
+// 9.7.8.9. Data Movement and Conversion Instructions: ld.global.nc
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc
+
+// 9.7.8.10. Data Movement and Conversion Instructions: ldu
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu
+
+// 9.7.8.11. Data Movement and Conversion Instructions: st
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st
+
+// 9.7.8.12. Data Movement and Conversion Instructions: st.async
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async
+
+// 9.7.8.13. Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red
+
+// 9.7.8.14. Data Movement and Conversion Instructions: prefetch, prefetchu
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu
+
+// 9.7.8.15. Data Movement and Conversion Instructions: applypriority
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority
+
+// 9.7.8.16. Data Movement and Conversion Instructions: discard
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard
+
+// 9.7.8.17. Data Movement and Conversion Instructions: createpolicy
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy
+
+// 9.7.8.18. Data Movement and Conversion Instructions: isspacep
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep
+
+// 9.7.8.19. Data Movement and Conversion Instructions: cvta
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta
+
+// 9.7.8.20. Data Movement and Conversion Instructions: cvt
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt
+
+// 9.7.8.21. Data Movement and Conversion Instructions: cvt.pack
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack
+
+// 9.7.8.22. Data Movement and Conversion Instructions: mapa
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa
+
+// 9.7.8.23. Data Movement and Conversion Instructions: getctarank
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank
+
+
+/*
+ * 9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy
+ *
+ */
+
+// 9.7.8.24.3. Data Movement and Conversion Instructions: cp.async
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async
+
+// 9.7.8.24.4. Data Movement and Conversion Instructions: cp.async.commit_group
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group
+
+// 9.7.8.24.5. Data Movement and Conversion Instructions: cp.async.wait_group / cp.async.wait_all
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all
+
+// 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
+
+// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk
+
+// 9.7.8.24.8. Data Movement and Conversion Instructions: cp.async.bulk.prefetch
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch
+
+// 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
+
+// 9.7.8.24.10. Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor
+
+// 9.7.8.24.11. Data Movement and Conversion Instructions: cp.async.bulk.prefetch.tensor
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor
+
+// 9.7.8.24.12. Data Movement and Conversion Instructions: cp.async.bulk.commit_group
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
+
+// 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
+
+// 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace
+
+
+/*
+ * 9.7.9. Texture Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions
+ *
+ */
+
+// 9.7.9.3. Texture Instructions: tex
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex
+
+// 9.7.9.4. Texture Instructions: tld4
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tld4
+
+// 9.7.9.5. Texture Instructions: txq
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-txq
+
+// 9.7.9.6. Texture Instructions: istypep
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep
+
+
+/*
+ * 9.7.10. Surface Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions
+ *
+ */
+
+// 9.7.10.1. Surface Instructions: suld
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld
+
+// 9.7.10.2. Surface Instructions: sust
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust
+
+// 9.7.10.3. Surface Instructions: sured
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sured
+
+// 9.7.10.4. Surface Instructions: suq
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suq
+
+
+/*
+ * 9.7.11. Control Flow Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions
+ *
+ */
+
+// 9.7.11.1. Control Flow Instructions: {}
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-curly-braces
+
+// 9.7.11.2. Control Flow Instructions: @
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-at
+
+// 9.7.11.3. Control Flow Instructions: bra
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra
+
+// 9.7.11.4. Control Flow Instructions: brx.idx
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx
+
+// 9.7.11.5. Control Flow Instructions: call
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call
+
+// 9.7.11.6. Control Flow Instructions: ret
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret
+
+// 9.7.11.7. Control Flow Instructions: exit
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit
+
+
+/*
+ * 9.7.12. Parallel Synchronization and Communication Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions
+ *
+ */
+
+// 9.7.12.1. Parallel Synchronization and Communication Instructions: bar, barrier
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier
+
+// 9.7.12.2. Parallel Synchronization and Communication Instructions: bar.warp.sync
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync
+
+// 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster
+
+// 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence
+
+// 9.7.12.5. Parallel Synchronization and Communication Instructions: atom
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
+
+// 9.7.12.6. Parallel Synchronization and Communication Instructions: red
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red
+
+// 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async
+
+// 9.7.12.8. Parallel Synchronization and Communication Instructions: vote (deprecated)
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated
+
+// 9.7.12.9. Parallel Synchronization and Communication Instructions: vote.sync
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync
+
+// 9.7.12.10. Parallel Synchronization and Communication Instructions: match.sync
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync
+
+// 9.7.12.11. Parallel Synchronization and Communication Instructions: activemask
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask
+
+// 9.7.12.12. Parallel Synchronization and Communication Instructions: redux.sync
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync
+
+// 9.7.12.13. Parallel Synchronization and Communication Instructions: griddepcontrol
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol
+
+// 9.7.12.14. Parallel Synchronization and Communication Instructions: elect.sync
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync
+
+/*
+ * 9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier
+ *
+ * Contained in: __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
+ */
+
+// 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy
+
+
+/*
+ * 9.7.13. Warp Level Matrix Multiply-Accumulate Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions
+ *
+ */
+
+// 9.7.13.3.3. Warp-level Matrix Load Instruction: wmma.load
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-wmma-load
+
+// 9.7.13.3.4. Warp-level Matrix Store Instruction: wmma.store
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-wmma-store
+
+// 9.7.13.3.5. Warp-level Matrix Multiply-and-Accumulate Instruction: wmma.mma
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-and-accumulate-instruction-wmma-mma
+
+// 9.7.13.4.14. Multiply-and-Accumulate Instruction: mma
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma
+
+// 9.7.13.4.15. Warp-level matrix load instruction: ldmatrix
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-ldmatrix
+
+// 9.7.13.4.16. Warp-level matrix store instruction: stmatrix
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix
+
+// 9.7.13.4.17. Warp-level matrix transpose instruction: movmatrix
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-transpose-instruction-movmatrix
+
+// 9.7.13.5.3. Multiply-and-Accumulate Instruction: mma.sp
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma-sp
+
+
+/*
+ * 9.7.14. Asynchronous Warpgroup Level Matrix Multiply-Accumulate Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-multiply-accumulate-instructions
+ *
+ */
+
+// 9.7.14.5.2. Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async
+
+// 9.7.14.6.4. Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async.sp
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async-sp
+
+// 9.7.14.7.1. Asynchronous Multiply-and-Accumulate Instruction: wgmma.fence
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-fence
+
+// 9.7.14.7.2. Asynchronous Multiply-and-Accumulate Instruction: wgmma.commit_group
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-commit-group
+
+// 9.7.14.7.3. Asynchronous Multiply-and-Accumulate Instruction: wgmma.wait_group
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-wait-group
+
+
+/*
+ * 9.7.15. Stack Manipulation Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions
+ *
+ */
+
+// 9.7.15.1. Stack Manipulation Instructions: stacksave
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stacksave
+
+// 9.7.15.2. Stack Manipulation Instructions: stackrestore
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stackrestore
+
+// 9.7.15.3. Stack Manipulation Instructions: alloca
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca
+
+
+/*
+ * 9.7.16. Video Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#video-instructions
+ *
+ */
+
+// 9.7.16.1.1. Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax
+
+// 9.7.16.1.2. Scalar Video Instructions: vshl, vshr
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr
+
+// 9.7.16.1.3. Scalar Video Instructions: vmad
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vmad
+
+// 9.7.16.1.4. Scalar Video Instructions: vset
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vset
+
+
+/*
+ * 9.7.16.2. SIMD Video Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions
+ *
+ */
+
+// 9.7.16.2.1. SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2
+
+// 9.7.16.2.2. SIMD Video Instructions: vset2
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset2
+
+// 9.7.16.2.3. SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4
+
+// 9.7.16.2.4. SIMD Video Instructions: vset4
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset4
+
+
+/*
+ * 9.7.17. Miscellaneous Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions
+ *
+ */
+
+// 9.7.17.1. Miscellaneous Instructions: brkpt
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt
+
+// 9.7.17.2. Miscellaneous Instructions: nanosleep
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep
+
+// 9.7.17.3. Miscellaneous Instructions: pmevent
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent
+
+// 9.7.17.4. Miscellaneous Instructions: trap
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap
+
+// 9.7.17.5. Miscellaneous Instructions: setmaxnreg
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _LIBCUDACXX___CUDA_PTX_H
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
new file mode 100644
index 0000000000..39bab14041
--- /dev/null
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
@@ -0,0 +1,514 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_
+#define _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_
+
+#include "ptx_dot_variants.h"
+#include "ptx_helper_functions.h"
+#include "ptx_isa_target_macros.h"
+#include "../../cstdint"
+
+#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
+#pragma GCC system_header
+#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
+_CCCL_IMPLICIT_SYSTEM_HEADER
+#endif // !_CCCL_COMPILER_NVHPC
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+/*
+ * 9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier
+ *
+ */
+
+// 9.7.12.15.9. Parallel Synchronization and Communication Instructions: mbarrier.init
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init
+
+// 9.7.12.15.10. Parallel Synchronization and Communication Instructions: mbarrier.inval
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval
+
+// 9.7.12.15.11. Parallel Synchronization and Communication Instructions: mbarrier.expect_tx
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx
+
+// 9.7.12.15.12. Parallel Synchronization and Communication Instructions: mbarrier.complete_tx
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx
+
+// 9.7.12.15.13. Parallel Synchronization and Communication Instructions: mbarrier.arrive
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
+
+/*
+PTX ISA docs:
+
+// mbarrier.arrive:
+mbarrier.arrive{.shared}.b64 state, [addr]; // 1. PTX ISA 70, SM_80
+mbarrier.arrive{.shared{::cta}}.b64 state, [addr]{, count}; // 2. PTX ISA 78, SM_90 (due to count)
+
+mbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64 state, [addr]{, count}; // 3. PTX ISA 80, SM_90 (some variants are SM_80, but are covered by 1)
+mbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64 _, [addr] {,count} // 4. PTX ISA 80, SM_90
+
+.sem = { .release }
+.scope = { .cta, .cluster }
+
+
+// mbarrier.arrive.noComplete:
+mbarrier.arrive.noComplete{.shared}.b64 state, [addr], count; // 5. PTX ISA 70, SM_80
+mbarrier.arrive.noComplete{.shared{::cta}}.b64 state, [addr], count; // 6. PTX ISA 78, Not exposed. Just a spelling change (shared -> shared::cta)
+mbarrier.arrive.noComplete{.sem}{.cta}{.shared{::cta}}.b64 state, [addr], count; // 7. PTX ISA 80, Not exposed. Adds .release, and .cta scope.
+
+
+// mbarrier.arrive.expect_tx:
+mbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90
+mbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90
+
+.sem = { .release }
+.scope = { .cta, .cluster }
+
+
+Corresponding Exposure:
+
+// mbarrier_arrive:
+mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80, !memory
+// count is non-optional, otherwise 3 would not be distinguishable from 1
+mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90, !memory
+mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90, !memory
+.space = { .shared::cta}
+.sem = { .release }
+.scope = { .cta, .cluster }
+
+mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90, !memory
+.space = { .shared::cta}
+.sem = { .release }
+.scope = { .cta, .cluster }
+
+// NOTE: .scope=.cta is dropped on purpose
+mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90, !memory
+.space = { .shared::cluster}
+.sem = { .release }
+.scope = { .cluster }
+
+// NOTE: .scope=.cta is dropped on purpose
+mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90, !memory
+.space = { .shared::cluster}
+.sem = { .release }
+.scope = { .cluster }
+
+
+// mbarrier_arrive_no_complete:
+mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80, !memory
+
+
+mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90, !memory
+.space = { .shared::cta }
+.sem = { .release }
+.scope = { .cta, .cluster }
+
+// NOTE: .scope=.cta is dropped on purpose
+mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90, !memory
+.space = { .shared::cluster }
+.sem = { .release }
+.scope = { .cluster }
+
+*/
+
+/*
+// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80
+template <typename=void>
+__device__ static inline uint64_t mbarrier_arrive(
+  uint64_t* addr);
+*/
+#if __cccl_ptx_isa >= 700
+extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__();
+template <typename=void>
+_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(
+  _CUDA_VSTD::uint64_t* __addr)
+{
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,(
+    _CUDA_VSTD::uint64_t __state;
+    asm (
+      "mbarrier.arrive.shared.b64 %0, [%1]; // 1. "
+      : "=l"(__state)
+      : "r"(__as_ptr_smem(__addr))
+      : "memory"
+    );
+    return __state;
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 700
+
+/*
+// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90
+template <typename=void>
+__device__ static inline uint64_t mbarrier_arrive(
+  uint64_t* addr,
+  const uint32_t& count);
+*/
+#if __cccl_ptx_isa >= 780
+extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
+template <typename=void>
+_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(
+  _CUDA_VSTD::uint64_t* __addr,
+  const _CUDA_VSTD::uint32_t& __count)
+{
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    _CUDA_VSTD::uint64_t __state;
+    asm (
+      "mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. "
+      : "=l"(__state)
+      : "r"(__as_ptr_smem(__addr)),
+        "r"(__count)
+      : "memory"
+    );
+    return __state;
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 780
+
+/*
+// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90
+// .sem = { .release }
+// .scope = { .cta, .cluster }
+// .space = { .shared::cta }
+template <cuda::ptx::dot_scope Scope>
+__device__ static inline uint64_t mbarrier_arrive(
+  cuda::ptx::sem_release_t,
+  cuda::ptx::scope_t<Scope> scope,
+  cuda::ptx::space_shared_t,
+  uint64_t* addr);
+*/
+#if __cccl_ptx_isa >= 800
+extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
+template <dot_scope _Scope>
+_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(
+  sem_release_t,
+  scope_t<_Scope> __scope,
+  space_shared_t,
+  _CUDA_VSTD::uint64_t* __addr)
+{
+  // __sem == sem_release (due to parameter type constraint)
+  static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+  // __space == space_shared (due to parameter type constraint)
+
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    _CUDA_VSTD::uint64_t __state;
+    if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
+      asm (
+        "mbarrier.arrive.release.cta.shared.b64 %0, [%1]; // 3a. "
+        : "=l"(__state)
+        : "r"(__as_ptr_smem(__addr))
+        : "memory"
+      );
+    } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) {
+      asm (
+        "mbarrier.arrive.release.cluster.shared.b64 %0, [%1]; // 3a. "
+        : "=l"(__state)
+        : "r"(__as_ptr_smem(__addr))
+        : "memory"
+      );
+    }
+    return __state;
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 800
+
+/*
+// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90
+// .sem = { .release }
+// .scope = { .cta, .cluster }
+// .space = { .shared::cta }
+template <cuda::ptx::dot_scope Scope>
+__device__ static inline uint64_t mbarrier_arrive(
+  cuda::ptx::sem_release_t,
+  cuda::ptx::scope_t<Scope> scope,
+  cuda::ptx::space_shared_t,
+  uint64_t* addr,
+  const uint32_t& count);
+*/
+#if __cccl_ptx_isa >= 800
+extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
+template <dot_scope _Scope>
+_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(
+  sem_release_t,
+  scope_t<_Scope> __scope,
+  space_shared_t,
+  _CUDA_VSTD::uint64_t* __addr,
+  const _CUDA_VSTD::uint32_t& __count)
+{
+  // __sem == sem_release (due to parameter type constraint)
+  static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+  // __space == space_shared (due to parameter type constraint)
+
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    _CUDA_VSTD::uint64_t __state;
+    if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
+      asm (
+        "mbarrier.arrive.release.cta.shared.b64 %0, [%1], %2; // 3b. "
+        : "=l"(__state)
+        : "r"(__as_ptr_smem(__addr)),
+          "r"(__count)
+        : "memory"
+      );
+    } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) {
+      asm (
+        "mbarrier.arrive.release.cluster.shared.b64 %0, [%1], %2; // 3b. "
+        : "=l"(__state)
+        : "r"(__as_ptr_smem(__addr)),
+          "r"(__count)
+        : "memory"
+      );
+    }
+    return __state;
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 800
+
+/*
+// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90
+// .sem = { .release }
+// .scope = { .cluster }
+// .space = { .shared::cluster }
+template <typename=void>
+__device__ static inline void mbarrier_arrive(
+  cuda::ptx::sem_release_t,
+  cuda::ptx::scope_cluster_t,
+  cuda::ptx::space_cluster_t,
+  uint64_t* addr);
+*/
+#if __cccl_ptx_isa >= 800
+extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
+template <typename=void>
+_LIBCUDACXX_DEVICE static inline void mbarrier_arrive(
+  sem_release_t,
+  scope_cluster_t,
+  space_cluster_t,
+  _CUDA_VSTD::uint64_t* __addr)
+{
+  // __sem == sem_release (due to parameter type constraint)
+  // __scope == scope_cluster (due to parameter type constraint)
+  // __space == space_cluster (due to parameter type constraint)
+
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm (
+      "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. "
+      :
+      : "r"(__as_ptr_smem(__addr))
+      : "memory"
+    );
+
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 800
+
+/*
+// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90
+// .sem = { .release }
+// .scope = { .cluster }
+// .space = { .shared::cluster }
+template <typename=void>
+__device__ static inline void mbarrier_arrive(
+  cuda::ptx::sem_release_t,
+  cuda::ptx::scope_cluster_t,
+  cuda::ptx::space_cluster_t,
+  uint64_t* addr,
+  const uint32_t& count);
+*/
+#if __cccl_ptx_isa >= 800
+extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
+template <typename=void>
+_LIBCUDACXX_DEVICE static inline void mbarrier_arrive(
+  sem_release_t,
+  scope_cluster_t,
+  space_cluster_t,
+  _CUDA_VSTD::uint64_t* __addr,
+  const _CUDA_VSTD::uint32_t& __count)
+{
+  // __sem == sem_release (due to parameter type constraint)
+  // __scope == scope_cluster (due to parameter type constraint)
+  // __space == space_cluster (due to parameter type constraint)
+
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm (
+      "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. "
+      :
+      : "r"(__as_ptr_smem(__addr)),
+        "r"(__count)
+      : "memory"
+    );
+
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 800
+
+/*
+// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80
+template <typename=void>
+__device__ static inline uint64_t mbarrier_arrive_no_complete(
+  uint64_t* addr,
+  const uint32_t& count);
+*/
+#if __cccl_ptx_isa >= 700
+extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__();
+template <typename=void>
+_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete(
+  _CUDA_VSTD::uint64_t* __addr,
+  const _CUDA_VSTD::uint32_t& __count)
+{
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,(
+    _CUDA_VSTD::uint64_t __state;
+    asm (
+      "mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. "
+      : "=l"(__state)
+      : "r"(__as_ptr_smem(__addr)),
+        "r"(__count)
+      : "memory"
+    );
+    return __state;
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 700
+
+/*
+// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90
+// .sem = { .release }
+// .scope = { .cta, .cluster }
+// .space = { .shared::cta }
+template <cuda::ptx::dot_scope Scope>
+__device__ static inline uint64_t mbarrier_arrive_expect_tx(
+  cuda::ptx::sem_release_t,
+  cuda::ptx::scope_t<Scope> scope,
+  cuda::ptx::space_shared_t,
+  uint64_t* addr,
+  const uint32_t& tx_count);
+*/
+#if __cccl_ptx_isa >= 800
+extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();
+template <dot_scope _Scope>
+_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(
+  sem_release_t,
+  scope_t<_Scope> __scope,
+  space_shared_t,
+  _CUDA_VSTD::uint64_t* __addr,
+  const _CUDA_VSTD::uint32_t& __tx_count)
+{
+  // __sem == sem_release (due to parameter type constraint)
+  static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+  // __space == space_shared (due to parameter type constraint)
+
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    _CUDA_VSTD::uint64_t __state;
+    if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
+      asm (
+        "mbarrier.arrive.expect_tx.release.cta.shared.b64 %0, [%1], %2; // 8. "
+        : "=l"(__state)
+        : "r"(__as_ptr_smem(__addr)),
+          "r"(__tx_count)
+        : "memory"
+      );
+    } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) {
+      asm (
+        "mbarrier.arrive.expect_tx.release.cluster.shared.b64 %0, [%1], %2; // 8. "
+        : "=l"(__state)
+        : "r"(__as_ptr_smem(__addr)),
+          "r"(__tx_count)
+        : "memory"
+      );
+    }
+    return __state;
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 800
+
+/*
+// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90
+// .sem = { .release }
+// .scope = { .cluster }
+// .space = { .shared::cluster }
+template <typename=void>
+__device__ static inline void mbarrier_arrive_expect_tx(
+  cuda::ptx::sem_release_t,
+  cuda::ptx::scope_cluster_t,
+  cuda::ptx::space_cluster_t,
+  uint64_t* addr,
+  const uint32_t& tx_count);
+*/
+#if __cccl_ptx_isa >= 800
+extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();
+template <typename=void>
+_LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx(
+  sem_release_t,
+  scope_cluster_t,
+  space_cluster_t,
+  _CUDA_VSTD::uint64_t* __addr,
+  const _CUDA_VSTD::uint32_t& __tx_count)
+{
+  // __sem == sem_release (due to parameter type constraint)
+  // __scope == scope_cluster (due to parameter type constraint)
+  // __space == space_cluster (due to parameter type constraint)
+
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm (
+      "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. "
+      :
+      : "r"(__as_ptr_smem(__addr)),
+        "r"(__tx_count)
+      : "memory"
+    );
+
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 800
+
+
+
+
+// 9.7.12.15.14. Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop
+
+// 9.7.12.15.15. Parallel Synchronization and Communication Instructions: cp.async.mbarrier.arrive
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive
+
+// 9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait
+
+// 9.7.12.15.17. Parallel Synchronization and Communication Instructions: mbarrier.pending_count
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-pending-count
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
new file mode 100644
index 0000000000..442c484e8e
--- /dev/null
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
@@ -0,0 +1,174 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_DOT_VARIANTS_H_
+#define _CUDA_PTX_DOT_VARIANTS_H_
+
+#include "../../__type_traits/integral_constant.h" // std::integral_constant
+
+#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
+#pragma GCC system_header
+#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
+_CCCL_IMPLICIT_SYSTEM_HEADER
+#endif // !_CCCL_COMPILER_NVHPC
+
+/*
+ * Public integral constant types and values for ".variant"s:
+ *
+ * - .sem: acquire, release, ..
+ * - .space: global, shared, constant, ..
+ * - .scope: cta, cluster, gpu, ..
+ * - .op: add, min, cas, ..
+ *
+ * For each .variant, the code below defines:
+ * - An enum `dot_variant` with each possible value
+ * - A type template `variant_t<dot_variant>`
+ * - Types `variant_A_t`, ..., `variant_Z_t`
+ * - Constexpr values `variant_A` of type `variant_A_t`
+ *
+ * These types enable specifying fine-grained overloads of a PTX binding. If a
+ * binding can handle multiple variants, then it is defined as:
+ *
+ *   template <dot_variant V>
+ *   [...] void ptx_binding(variant_t<V> __v) { ... }
+ *
+ * If it only handles a single variant, then it is defined as:
+ *
+ *   [...] void ptx_binding(variant_A_t __v) { ... }
+ *
+ * If two variants have different behaviors or return types (see .space
+ * overloads of mbarrier.arrive.expect_tx for an example), then these can be
+ * provided as separate overloads of the same function:
+ *
+ *   [...] void ptx_binding(variant_A_t __v) { ... }
+ *   [...] int ptx_binding(variant_B_t __v) { ... }
+ *
+ */
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#operation-types
+enum class dot_sem
+{
+  acq_rel,
+  acquire,
+  relaxed,
+  release,
+  sc,
+  weak
+};
+
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#state-spaces
+enum class dot_space
+{
+  global,
+  cluster, // The PTX spelling is shared::cluster
+  shared,  // The PTX spelling is shared::cta
+
+  // The following state spaces are unlikely to be used in cuda::ptx in the near
+  // future, so they are not exposed:
+
+  // reg,
+  // sreg,
+  // const_mem, // Using const_mem as `const` is reserved in C++.
+  // local,
+  // param,
+  // tex // deprecated
+};
+
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scope
+enum class dot_scope
+{
+  cta,
+  cluster,
+  gpu,
+  sys
+};
+
+enum class dot_op
+{
+  add,
+  dec,
+  inc,
+  max,
+  min,
+  and_op, // Using and_op, as `and, or, xor` are reserved in C++.
+  or_op,
+  xor_op,
+  cas,
+  exch
+};
+
+template <dot_sem __sem>
+using sem_t = _CUDA_VSTD::integral_constant<dot_sem, __sem>;
+using sem_acq_rel_t = sem_t<dot_sem::acq_rel>;
+using sem_acquire_t = sem_t<dot_sem::acquire>;
+using sem_relaxed_t = sem_t<dot_sem::relaxed>;
+using sem_release_t = sem_t<dot_sem::release>;
+using sem_sc_t = sem_t<dot_sem::sc>;
+using sem_weak_t = sem_t<dot_sem::weak>;
+
+static constexpr sem_acq_rel_t sem_acq_rel{};
+static constexpr sem_acquire_t sem_acquire{};
+static constexpr sem_relaxed_t sem_relaxed{};
+static constexpr sem_release_t sem_release{};
+static constexpr sem_sc_t sem_sc{};
+static constexpr sem_weak_t sem_weak{};
+
+template <dot_space __space>
+using space_t = _CUDA_VSTD::integral_constant<dot_space, __space>;
+using space_global_t = space_t<dot_space::global>;
+using space_shared_t = space_t<dot_space::shared>;
+using space_cluster_t = space_t<dot_space::cluster>;
+
+static constexpr space_global_t space_global{};
+static constexpr space_shared_t space_shared{};
+static constexpr space_cluster_t space_cluster{};
+
+template <dot_scope __scope>
+using scope_t = _CUDA_VSTD::integral_constant<dot_scope, __scope>;
+using scope_cluster_t = scope_t<dot_scope::cluster>;
+using scope_cta_t = scope_t<dot_scope::cta>;
+using scope_gpu_t = scope_t<dot_scope::gpu>;
+using scope_sys_t = scope_t<dot_scope::sys>;
+
+static constexpr scope_cluster_t scope_cluster{};
+static constexpr scope_cta_t scope_cta{};
+static constexpr scope_gpu_t scope_gpu{};
+static constexpr scope_sys_t scope_sys{};
+
+template <dot_op __op>
+using op_t = _CUDA_VSTD::integral_constant<dot_op, __op>;
+using op_add_t = op_t<dot_op::add>;
+using op_dec_t = op_t<dot_op::dec>;
+using op_inc_t = op_t<dot_op::inc>;
+using op_max_t = op_t<dot_op::max>;
+using op_min_t = op_t<dot_op::min>;
+using op_and_op_t = op_t<dot_op::and_op>;
+using op_or_op_t = op_t<dot_op::or_op>;
+using op_xor_op_t = op_t<dot_op::xor_op>;
+using op_cas_t = op_t<dot_op::cas>;
+using op_exch_t = op_t<dot_op::exch>;
+
+static constexpr op_add_t op_add{};
+static constexpr op_dec_t op_dec{};
+static constexpr op_inc_t op_inc{};
+static constexpr op_max_t op_max{};
+static constexpr op_min_t op_min{};
+static constexpr op_and_op_t op_and_op{};
+static constexpr op_or_op_t op_or_op{};
+static constexpr op_xor_op_t op_xor_op{};
+static constexpr op_cas_t op_cas{};
+static constexpr op_exch_t op_exch{};
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_DOT_VARIANTS_H_
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
new file mode 100644
index 0000000000..f6ec0b3959
--- /dev/null
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
@@ -0,0 +1,62 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_HELPER_FUNCTIONS_H_
+#define _CUDA_PTX_HELPER_FUNCTIONS_H_
+
+#include "../../cstdint" // uint32_t
+
+#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
+#pragma GCC system_header
+#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
+_CCCL_IMPLICIT_SYSTEM_HEADER
+#endif // !_CCCL_COMPILER_NVHPC
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_ptr_smem(const void* __ptr)
+{
+  // Consider adding debug asserts here.
+  return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
+}
+
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_ptr_remote_dsmem(const void* __ptr)
+{
+  // No difference in implementation to __as_ptr_smem.
+  // Consider adding debug asserts here.
+  return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
+}
+
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_ptr_gmem(const void* __ptr)
+{
+  // Consider adding debug asserts here.
+  return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
+}
+
+template <typename _Tp>
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_b32(_Tp __val)
+{
+  static_assert(sizeof(_Tp) == 4, "");
+  // Consider using std::bit_cast
+  return *reinterpret_cast<_CUDA_VSTD::uint32_t*>(&__val);
+}
+
+template <typename _Tp>
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val)
+{
+  static_assert(sizeof(_Tp) == 8, "");
+  // Consider using std::bit_cast
+  return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val);
+}
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_HELPER_FUNCTIONS_H_
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
new file mode 100644
index 0000000000..ca5297e4de
--- /dev/null
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
@@ -0,0 +1,75 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
+#define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
+#pragma GCC system_header
+#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
+_CCCL_IMPLICIT_SYSTEM_HEADER
+#endif // !_CCCL_COMPILER_NVHPC
+
+/*
+ * Targeting macros
+ *
+ * Information from:
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes
+ */
+
+// PTX ISA 8.3 is available from CUDA 12.3, driver r545
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 830ULL
+// PTX ISA 8.2 is available from CUDA 12.2, driver r535
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 820ULL
+// PTX ISA 8.1 is available from CUDA 12.1, driver r530
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 810ULL
+// PTX ISA 8.0 is available from CUDA 12.0, driver r525
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 800ULL
+// PTX ISA 7.8 is available from CUDA 11.8, driver r520
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 780ULL
+// PTX ISA 7.7 is available from CUDA 11.7, driver r515
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 770ULL
+// PTX ISA 7.6 is available from CUDA 11.6, driver r510
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 760ULL
+// PTX ISA 7.5 is available from CUDA 11.5, driver r495
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 750ULL
+// PTX ISA 7.4 is available from CUDA 11.4, driver r470
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 740ULL
+// PTX ISA 7.3 is available from CUDA 11.3, driver r465
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 730ULL
+// PTX ISA 7.2 is available from CUDA 11.2, driver r460
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 720ULL
+// PTX ISA 7.1 is available from CUDA 11.1, driver r455
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 710ULL
+// PTX ISA 7.0 is available from CUDA 11.0, driver r445
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 700ULL
+// Fallback case. Define the ISA version to be zero. This ensures that the macro is always defined.
+#else
+# define __cccl_ptx_isa 0ULL
+#endif
+
+#endif // _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
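
Reviewer note: the arrive bindings above are easiest to follow end to end, so here is a minimal usage sketch. It is illustrative only and not part of the change set: it assumes an SM_90 build where `__cccl_ptx_isa >= 800`, and a barrier that has already been initialized via `mbarrier.init` (9.7.12.15.9, not exposed in this change). The kernel name and the arrive/transaction counts are made up for the example.

```cuda
#include <cuda/ptx>
#include <cuda/std/cstdint>
#include <nv/target>

__global__ void arrive_sketch()
{
  using cuda::ptx::sem_release;
  using cuda::ptx::scope_cta;
  using cuda::ptx::scope_cluster;
  using cuda::ptx::space_shared;
  using cuda::ptx::space_cluster;

  // Assumed to be initialized elsewhere (mbarrier.init) before any arrival.
  __shared__ cuda::std::uint64_t bar;
  cuda::std::uint64_t state = 0;

#if __cccl_ptx_isa >= 700
  NV_IF_TARGET(NV_PROVIDES_SM_80, (
    // Plain arrive on the CTA-local barrier (variant 1).
    state = cuda::ptx::mbarrier_arrive(&bar);
  ));
#endif // __cccl_ptx_isa >= 700

#if __cccl_ptx_isa >= 800
  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    // The dot-variant arguments drive overload resolution to variant 3b:
    // release semantics, cluster scope, shared::cta space, arrive count 1.
    state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1);
    // The shared::cluster overload (variant 4b) returns void: the PTX
    // instruction discards the state operand (`_`) for remote barriers.
    cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar, 1);
    // Signal an expected-transaction count together with the arrival (variant 8).
    state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1);
  ));
#endif // __cccl_ptx_isa >= 800
  (void) state;
}
```

As the sketch shows, the `.space` overloads are exactly the case called out in `ptx_dot_variants.h` where variants of one instruction differ in return type: the `shared::cluster` arrivals return `void` because the state of a remote cluster barrier is not locally observable.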