
Fix ptx usage to account for PTX ISA availability #1359

Merged: 9 commits, Feb 21, 2024
4 changes: 2 additions & 2 deletions libcudacxx/include/cuda/barrier
@@ -50,7 +50,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL
// capability 9.0 and above. The check for (!defined(__CUDA_MINIMUM_ARCH__)) is
// necessary to prevent cudafe from ripping out the device functions before
// device compilation begins.
#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__)
#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure
Contributor review comment: The functions below are not, strictly speaking, part of the experimental exposure, but the check for the feature is currently the same as what the check for availability of cp.async.bulk would be. Not a blocker imho, just want to note this.


// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
inline _LIBCUDACXX_DEVICE
@@ -288,7 +288,7 @@ void cp_async_bulk_wait_group_read()
: "memory");
}

#endif // __CUDA_MINIMUM_ARCH__
#endif // __cccl_lib_experimental_ctk12_cp_async_exposure

_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL

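For orientation, here is a minimal, hypothetical sketch (not part of the diff) of how downstream device code might key off the new __cccl_lib_experimental_ctk12_cp_async_exposure feature-test macro instead of checking __CUDA_MINIMUM_ARCH__ directly; the kernel name and body are illustrative only.

#include <cuda/barrier> // makes the __cccl feature-test macros visible

__global__ void my_bulk_copy_kernel() // hypothetical name
{
#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure
  // The cuda::device::experimental cp.async.bulk helpers (e.g.
  // cp_async_bulk_wait_group_read) are compiled in on this toolkit/architecture.
#else
  // Fall back to an ordinary copy path on older toolkits or architectures.
#endif
}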
@@ -1,19 +1,18 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef __CCCL_PTX_ISA_H_
#define __CCCL_PTX_ISA_H_

#ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
#define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_

#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
#include "../__cccl/compiler.h"
#include "../__cccl/system_header.h"

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
Expand All @@ -23,6 +22,8 @@
# pragma system_header
#endif // no system header

#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends

/*
* Targeting macros
*
@@ -31,47 +32,75 @@
*/

// PTX ISA 8.3 is available from CUDA 12.3, driver r545
#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__))
// The first define is for future major versions of CUDACC.
// We make sure that these get the highest known PTX ISA version.
#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12)) || (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 830ULL
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 830ULL
// PTX ISA 8.2 is available from CUDA 12.2, driver r535
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 820ULL
// PTX ISA 8.1 is available from CUDA 12.1, driver r530
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 810ULL
// PTX ISA 8.0 is available from CUDA 12.0, driver r525
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 800ULL
// PTX ISA 7.8 is available from CUDA 11.8, driver r520
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 780ULL
// PTX ISA 7.7 is available from CUDA 11.7, driver r515
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 770ULL
// PTX ISA 7.6 is available from CUDA 11.6, driver r510
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 760ULL
// PTX ISA 7.5 is available from CUDA 11.5, driver r495
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 750ULL
// PTX ISA 7.4 is available from CUDA 11.4, driver r470
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 740ULL
// PTX ISA 7.3 is available from CUDA 11.3, driver r465
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 730ULL
// PTX ISA 7.2 is available from CUDA 11.2, driver r460
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 720ULL
// PTX ISA 7.1 is available from CUDA 11.1, driver r455
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 710ULL
// PTX ISA 7.0 is available from CUDA 11.0, driver r445
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 700ULL
// Fallback case. Define the ISA version to be zero. This ensures that the macro is always defined.
#else
# define __cccl_ptx_isa 0ULL
#endif

#endif // _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
// We define certain feature test macros depending on availability. When
// __CUDA_MINIMUM_ARCH__ is not available, we define the following features
// depending on PTX ISA. This permits checking for the feature in host code.
// When __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the
// hardware supports it.
#if (!defined(__CUDA_MINIMUM_ARCH__) \
     || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__)) \
    && __cccl_ptx_isa >= 800
# define __cccl_lib_local_barrier_arrive_tx
# define __cccl_lib_experimental_ctk12_cp_async_exposure
#endif

#endif // __CCCL_PTX_ISA_H_
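To make the intent of the CUDA-version-to-PTX-ISA mapping concrete, here is a small hypothetical usage sketch (not code from this PR): a library-internal wrapper around a PTX ISA 8.0 instruction is compiled only when the toolkit can assemble it, and is then narrowed to SM_90 at code-generation time. The function name is made up, and the sketch assumes "__cccl/ptx_isa.h" and <nv/target> are already in scope, as they are where this header is consumed.

#if __cccl_ptx_isa >= 800
__device__ inline void my_fence_proxy_async() // hypothetical wrapper
{
  // fence.proxy.async requires PTX ISA 8.0 and an sm_90 target at codegen time.
  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    asm volatile("fence.proxy.async;" ::: "memory");
  ));
}
#endif // __cccl_ptx_isa >= 800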
@@ -15,6 +15,7 @@
#include "__cccl/diagnostic.h"
#include "__cccl/dialect.h"
#include "__cccl/execution_space.h"
#include "__cccl/ptx_isa.h"
#include "__cccl/system_header.h"
#include "__cccl/version.h"
#include "__cccl/visibility.h"
104 changes: 67 additions & 37 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -572,11 +572,8 @@ inline _CUDA_VSTD::uint64_t * barrier_native_handle(barrier<thread_scope_block>

#if defined(_CCCL_CUDA_COMPILER)

// Hide arrive_tx when CUDA architecture is insufficient. Note the
// (!defined(__CUDA_MINIMUM_ARCH__)). This is required to make sure the function
// does not get removed by cudafe, which does not define __CUDA_MINIMUM_ARCH__.
#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))

#if __cccl_ptx_isa >= 800
extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();
_LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline
barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
barrier<thread_scope_block> & __b,
@@ -591,7 +588,7 @@ barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
_LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");

barrier<thread_scope_block>::arrival_token __token = {};
NV_IF_TARGET(
NV_IF_ELSE_TARGET(
// On architectures pre-sm90, arrive_tx is not supported.
NV_PROVIDES_SM_90, (
// We do not check for the statespace of the barrier here. This is
@@ -619,11 +616,47 @@ barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
_CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __arrive_count_update
);
}
),(
__cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();
)
);
return __token;
}

extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__();
_LIBCUDACXX_DEVICE inline
void barrier_expect_tx(
barrier<thread_scope_block> & __b,
_CUDA_VSTD::ptrdiff_t __transaction_count_update) {

_LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
_LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative.");
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object
_LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");

// We do not check for the statespace of the barrier here. This is
// on purpose. This allows debugging tools like memcheck/racecheck
// to detect that we are passing a pointer with the wrong state
// space to mbarrier.arrive. If we checked for the state space here,
// and __trap() if wrong, then those tools would not be able to help
// us in release builds. In debug builds, the error would be caught
// by the asserts at the top of this function.
NV_IF_ELSE_TARGET(
// On architectures pre-sm90, expect_tx is not supported.
NV_PROVIDES_SM_90, (
auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
asm (
"mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
: "memory");
),(
__cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__();
));
}

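The extern "C" declarations introduced in this hunk (e.g. __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__) are deliberately never defined: only the unsupported NV_IF_ELSE_TARGET branch calls them, so code that reaches these helpers when compiled for a pre-SM_90 target fails at link time with a self-describing symbol name instead of emitting invalid PTX. A minimal sketch of the same pattern, with made-up names and <nv/target> assumed to be included:

extern "C" __device__ void my_feature_is_not_supported_before_SM_90__(); // never defined

__device__ void my_feature()
{
  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, (
    // Real SM_90 code path (PTX elided in this sketch).
  ),(
    // Unsupported target: calling the undefined function turns misuse into a
    // clear link-time error rather than a silent miscompile.
    my_feature_is_not_supported_before_SM_90__();
  ));
}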
extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__();
template <typename _Tp, _CUDA_VSTD::size_t _Alignment>
_LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
_Tp* __dest,
@@ -643,6 +676,7 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
_LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "dest must point to shared memory.");
_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "src must point to global memory.");

NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
if (__isShared(__dest) && __isGlobal(__src)) {
asm volatile(
@@ -660,36 +694,13 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
// is not yet implemented. So we trap in this case as well.
_LIBCUDACXX_UNREACHABLE();
}
),(
__cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__();
));

return async_contract_fulfillment::async;
}

_LIBCUDACXX_DEVICE inline
void barrier_expect_tx(
barrier<thread_scope_block> & __b,
_CUDA_VSTD::ptrdiff_t __transaction_count_update) {

_LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
_LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative.");
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object
_LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");

// We do not check for the statespace of the barrier here. This is
// on purpose. This allows debugging tools like memcheck/racecheck
// to detect that we are passing a pointer with the wrong state
// space to mbarrier.arrive. If we checked for the state space here,
// and __trap() if wrong, then those tools would not be able to help
// us in release builds. In debug builds, the error would be caught
// by the asserts at the top of this function.
auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
asm (
"mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
: "memory");
}
#endif // __CUDA_MINIMUM_ARCH__
#endif // __cccl_ptx_isa >= 800
#endif // _CCCL_CUDA_COMPILER

_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE
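For context, a hedged usage sketch of the arrive_tx path as it looks from user code after this change. The kernel name, buffer size, and alignment are illustrative; the sketch assumes an SM_90 device, CUDA 12.x, and the cuda::device:: entry points shown in this diff.

#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move

using block_barrier = cuda::barrier<cuda::thread_scope_block>;

__global__ void my_tx_kernel(const int* gmem_src) // hypothetical kernel
{
  __shared__ alignas(16) int smem[256];
  __shared__ block_barrier bar;
  if (threadIdx.x == 0) {
    init(&bar, blockDim.x); // found by ADL on cuda::barrier
  }
  __syncthreads();

  block_barrier::arrival_token token = {};
  if (threadIdx.x == 0) {
    // One thread issues the bulk copy and folds the expected byte count
    // into its arrive via the transaction count update.
    cuda::device::memcpy_async_tx(smem, gmem_src,
                                  cuda::aligned_size_t<16>(sizeof(smem)), bar);
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem));
  } else {
    token = bar.arrive();
  }
  bar.wait(cuda::std::move(token)); // smem is ready after this
}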
@@ -763,6 +774,7 @@ _CUDA_VSTD::uint64_t * __try_get_barrier_handle<::cuda::thread_scope_block, _CUD
// The user is still responsible for arriving and waiting on (or otherwise
// synchronizing with) the barrier or pipeline barrier to see the results of
// copies from other threads participating in the synchronization object.
extern "C" _LIBCUDACXX_HOST_DEVICE void __cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__();
struct __memcpy_completion_impl {

template<typename _Group>
@@ -796,13 +808,17 @@ struct __memcpy_completion_impl {
// bulk group to be used with shared memory barriers.
_LIBCUDACXX_UNREACHABLE();
case __completion_mechanism::__mbarrier_complete_tx:
#if __cccl_ptx_isa >= 800
// Pre-sm90, the mbarrier_complete_tx completion mechanism is not available.
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// Only perform the expect_tx operation with the leader thread
if (__group.thread_rank() == 0) {
::cuda::device::barrier_expect_tx(__barrier, __size);
}
),(
__cuda_ptx_mbarrier_complete_tx_is_not_supported_before_SM_90__();
));
#endif // __cccl_ptx_isa >= 800
return async_contract_fulfillment::async;
case __completion_mechanism::__sync:
// sync: In this case, we do not need to do anything. The user will have
@@ -929,11 +945,13 @@ struct __memcpy_completion_impl {
* 5. normal synchronous copy (fallback)
***********************************************************************/

#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
#if __cccl_ptx_isa >= 800
extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();
template <typename _Group>
inline __device__
void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char * __src, size_t __size, uint64_t *__bar_handle) {
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
if (__g.thread_rank() == 0) {
asm volatile(
"cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
@@ -944,10 +962,13 @@ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char
"r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__bar_handle)))
: "memory");
}
),(
__cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();
));
}
#endif // __CUDA_MINIMUM_ARCH__
#endif // __cccl_ptx_isa >= 800

#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();
template <size_t _Copy_size>
inline __device__
void __cp_async_shared_global(char * __dest, const char * __src) {
@@ -959,27 +980,35 @@ void __cp_async_shared_global(char * __dest, const char * __src) {
static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16, "cp.async.shared.global requires a copy size of 4, 8, or 16.");
#endif // _CCCL_STD_VER >= 2017

NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,(
asm volatile(
"cp.async.ca.shared.global [%0], [%1], %2, %2;"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
"l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
"n"(_Copy_size)
: "memory");
),(
__cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();
));
}

template <>
inline __device__
void __cp_async_shared_global<16>(char * __dest, const char * __src) {
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async
// When copying 16 bytes, it is possible to skip L1 cache (.cg).
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,(
asm volatile(
"cp.async.cg.shared.global [%0], [%1], %2, %2;"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
"l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
"n"(16)
: "memory");
),(
__cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();
));
}

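As a side note on the .ca/.cg split above: the 16-byte specialization uses cp.async.cg, which bypasses L1 and stages the copy through L2 only. The following standalone, hypothetical sketch (not the PR's code, assumes compilation for SM_80 or newer) shows the full cp.async lifecycle these helpers participate in, including the commit/wait steps that the library performs elsewhere.

// Hypothetical SM_80+ helper: 16-byte L1-bypassing async copy, then commit the
// group and wait on it so the data is visible before use.
__device__ inline void my_copy16_and_wait(void* dst_shared, const void* src_global)
{
  asm volatile("cp.async.cg.shared.global [%0], [%1], 16;"
               :
               : "r"(static_cast<unsigned>(__cvta_generic_to_shared(dst_shared))),
                 "l"(static_cast<unsigned long long>(__cvta_generic_to_global(src_global)))
               : "memory");
  asm volatile("cp.async.commit_group;" ::: "memory");
  asm volatile("cp.async.wait_group 0;" ::: "memory");
}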
template <size_t _Alignment, typename _Group>
@@ -1002,7 +1031,6 @@ void __cp_async_shared_global_mechanism(_Group __g, char * __dest, const char *
__cp_async_shared_global<__copy_size>(__dest + __offset, __src + __offset);
}
}
#endif // __CUDA_MINIMUM_ARCH__

template <size_t _Copy_size>
struct __copy_chunk {
@@ -1083,6 +1111,7 @@ __completion_mechanism __dispatch_memcpy_async_any_to_any(_Group const & __group
template<_CUDA_VSTD::size_t _Align, typename _Group>
_LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline
__completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & __group, char * __dest_char, char const * __src_char, _CUDA_VSTD::size_t __size, uint32_t __allowed_completions, uint64_t* __bar_handle) {
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
const bool __can_use_complete_tx = __allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx);
_LIBCUDACXX_DEBUG_ASSERT(__can_use_complete_tx == (nullptr != __bar_handle), "Pass non-null bar_handle if and only if can_use_complete_tx.");
@@ -1094,6 +1123,7 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _
}
// Fallthrough to SM 80..
));
#endif // __cccl_ptx_isa >= 800

NV_IF_TARGET(NV_PROVIDES_SM_80, (
if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (_Align >= 4) {
@@ -26,7 +26,6 @@

#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends

#include "../__cuda/ptx/ptx_isa_target_macros.h"
#include "../__cuda/ptx/ptx_dot_variants.h"
#include "../__cuda/ptx/ptx_helper_functions.h"
#include "../__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h"