Skip to content

Commit

Permalink
Backport 574 ptx (#663)
Browse files Browse the repository at this point in the history
* Add `cuda::ptx::*` namespace (#574)

* fixup `___CUDA_VPTX` -> `_CUDA_VPTX` (#664)

* fixup `___CUDA_VPTX` -> `_CUDA_VPTX`

* Fix warning for unused variable in branches that are constexpr disabled.

---------

Co-authored-by: Allard Hendriksen <[email protected]>
Co-authored-by: Wesley Maxey <[email protected]>
  • Loading branch information
3 people committed Nov 8, 2023
1 parent db1312e commit bb37c94
Show file tree
Hide file tree
Showing 12 changed files with 2,334 additions and 30 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

#include "concurrent_agents.h"
#include "cuda_space_selector.h"
#include "test_macros.h"

// Warning sink: referencing values here marks them as "used", silencing
// -Wunused-variable / unused-but-set diagnostics for variables whose only
// purpose is to receive results in this compile-only test.
template <typename... _Args>
__device__ inline bool __unused(_Args...)
{
  return true;
}

// Compile-only smoke test: instantiates the cuda::ptx::mbarrier_arrive*
// overloads so that a missing or ill-formed wrapper breaks the build.
// The numbered trailing comments (1., 2., 3a., ...) refer to the overload
// numbering used in the cuda::ptx mbarrier.arrive documentation.
// NOTE(review): the kernel is never launched by main(); results are not
// checked at run time.
__global__ void test_compilation() {
using cuda::ptx::sem_release;
using cuda::ptx::space_cluster;
using cuda::ptx::space_shared;
using cuda::ptx::scope_cluster;
using cuda::ptx::scope_cta;

// Shared-memory barrier word plus a token receiver; both are only written
// so the overloads below have somewhere to put their results.
__shared__ uint64_t bar;
bar = 1;
uint64_t state = 1;

// Basic arrive / arrive_no_complete: available from PTX ISA 7.0 on SM_80+.
#if __cccl_ptx_isa >= 700
NV_IF_TARGET(NV_PROVIDES_SM_80, (
state = cuda::ptx::mbarrier_arrive(&bar); // 1.
state = cuda::ptx::mbarrier_arrive_no_complete(&bar, 1); // 5.
));
#endif // __cccl_ptx_isa >= 700

// Arrive with an explicit count argument (SM_90 overload).
// This guard is redundant: before PTX ISA 7.8, there was no support for SM_90
#if __cccl_ptx_isa >= 780
NV_IF_TARGET(NV_PROVIDES_SM_90, (
state = cuda::ptx::mbarrier_arrive(&bar, 1); // 2.
));
#endif // __cccl_ptx_isa >= 780

// PTX ISA 8.0 overloads taking explicit semantics / scope / state-space
// tags. The space_cluster variants return no token, hence no assignment.
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 3a.
state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar); // 3a.

state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3b.
state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 3b.

cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar); // 4a.

cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar, 1); // 4b.

state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 8.
state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 8.

cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, &bar, 1); // 9.
));
#endif // __cccl_ptx_isa >= 800
// Mark both variables as used so architectures where every block above is
// compiled out do not emit unused-variable warnings.
__unused(bar, state);
}

// Host entry point. This is a compile-only test: success is the translation
// unit building, so there is nothing to execute and we simply report success.
int main(int, char**) { return 0; }
2 changes: 2 additions & 0 deletions libcudacxx/.upstream-tests/test/support/concurrent_agents.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#endif
#endif

#include <cuda/std/cassert>

#include "test_macros.h"

TEST_EXEC_CHECK_DISABLE
Expand Down
2 changes: 2 additions & 0 deletions libcudacxx/docs/extended_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ nav_order: 3

{% include_relative extended_api/functional.md %}

{% include_relative extended_api/ptx.md %}

[Thread Scopes]: ./extended_api/memory_model.md#thread-scopes
[Thread Groups]: ./extended_api/thread_groups.md

669 changes: 669 additions & 0 deletions libcudacxx/docs/extended_api/ptx.md

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions libcudacxx/include/cuda/ptx
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

// <cuda/ptx> umbrella header: exposes the cuda::ptx::* namespace of thin
// wrappers around PTX instructions (e.g. the mbarrier.arrive family).

#ifndef _CUDA_PTX
#define _CUDA_PTX

// Library configuration first; it defines the ABI/namespace macros
// (_LIBCUDACXX_ABI_NAMESPACE, _CUDA_VPTX, ...) used by the implementation.
#include "std/detail/__config"

// Push/pop compiler diagnostic state around the implementation header so
// library-internal warnings do not leak into user translation units.
#include "std/detail/__pragma_push"

// The actual cuda::ptx wrapper implementations.
#include "std/detail/libcxx/include/__cuda/ptx.h"

#include "std/detail/__pragma_pop"

#endif // _CUDA_PTX
3 changes: 3 additions & 0 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__config
Original file line number Diff line number Diff line change
Expand Up @@ -1505,6 +1505,9 @@ typedef __char32_t char32_t;
#define _LIBCUDACXX_END_NAMESPACE_CUDA } }
#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE namespace cuda { namespace device { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
#define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE } } }
#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX namespace cuda { namespace ptx { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
#define _LIBCUDACXX_END_NAMESPACE_CUDA_PTX } } }
#define _CUDA_VPTX ::cuda::ptx::_LIBCUDACXX_ABI_NAMESPACE
#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL namespace cuda { namespace device { namespace experimental { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
#define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL } } } }
#endif
Expand Down
48 changes: 18 additions & 30 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER

#include "../cstdlib" // _LIBCUDACXX_UNREACHABLE
#include "../__type_traits/void_t.h" // _CUDA_VSTD::__void_t
#include "../__cuda/ptx.h" // cuda::ptx::*

#if defined(_LIBCUDACXX_COMPILER_NVRTC)
#define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member))
Expand Down Expand Up @@ -206,29 +207,21 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity;
else if (!__isShared(&__barrier)) {
__trap();
}

asm volatile ("mbarrier.arrive.shared.b64 %0, [%1], %2;"
: "=l"(__token)
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__update))
: "memory");
// Cannot use cuda::device::barrier_native_handle here, as it is
// only defined for block-scope barriers. This barrier may be a
// non-block scoped barrier.
auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier);
__token = _CUDA_VPTX::mbarrier_arrive(__bh, __update);
), NV_PROVIDES_SM_80, (
if (!__isShared(&__barrier)) {
return __barrier.arrive(__update);
}

auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier);
// Need 2 instructions, can't finish barrier with arrive > 1
if (__update > 1) {
asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
: "=l"(__token)
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__update - 1))
: "memory");
_CUDA_VPTX::mbarrier_arrive_no_complete(__bh, __update - 1);
}
asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
: "=l"(__token)
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier)))
: "memory");
__token = _CUDA_VPTX::mbarrier_arrive( __bh);
), NV_IS_DEVICE, (
if (!__isShared(&__barrier)) {
return __barrier.arrive(__update);
Expand Down Expand Up @@ -603,27 +596,22 @@ barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
// us in release builds. In debug builds, the error would be caught
// by the asserts at the top of this function.

auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
auto __native_handle = barrier_native_handle(__b);
auto __bh = __cvta_generic_to_shared(__native_handle);
if (__arrive_count_update == 1) {
asm (
"mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
: "=l"(__token)
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
: "memory");
__token = _CUDA_VPTX::mbarrier_arrive_expect_tx(
_CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __transaction_count_update
);
} else {
asm (
"mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
: "memory");
asm (
"mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2;"
: "=l"(__token)
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__arrive_count_update))
: "memory");
__token = _CUDA_VPTX::mbarrier_arrive(
_CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __arrive_count_update
);
}
)
);
Expand Down Expand Up @@ -1089,8 +1077,8 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _
));

NV_IF_TARGET(NV_PROVIDES_SM_80, (
const bool __can_use_async_group = __allowed_completions & uint32_t(__completion_mechanism::__async_group);
if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (_Align >= 4) {
const bool __can_use_async_group = __allowed_completions & uint32_t(__completion_mechanism::__async_group);
if (__can_use_async_group) {
__cp_async_shared_global_mechanism<_Align>(__group, __dest_char, __src_char, __size);
return __completion_mechanism::__async_group;
Expand Down
Loading

0 comments on commit bb37c94

Please sign in to comment.