Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix ptx usage to account for PTX ISA availability #1359

Merged
merged 9 commits into from
Feb 21, 2024
4 changes: 2 additions & 2 deletions libcudacxx/include/cuda/barrier
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL
// capability 9.0 and above. The check for (!defined(__CUDA_MINIMUM_ARCH__)) is
// necessary to prevent cudafe from ripping out the device functions before
// device compilation begins.
#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__)
#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function below are not strictly speaking part of the experimental exposure, but the check for the feature is currently the same as the check for availability of cp.async.bulk would be. Not a blocker imho, just want to note this.


// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
inline _LIBCUDACXX_DEVICE
Expand Down Expand Up @@ -288,7 +288,7 @@ void cp_async_bulk_wait_group_read()
: "memory");
}

#endif // __CUDA_MINIMUM_ARCH__
#endif // __cccl_lib_experimental_ctk12_cp_async_exposure

_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL

Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef __CCCL_PTX_ISA_H_
#define __CCCL_PTX_ISA_H_

#ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
#define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_

#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
#include "../__cccl/compiler.h"
#include "../__cccl/system_header.h"

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
Expand All @@ -23,6 +22,8 @@
# pragma system_header
#endif // no system header

#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends

/*
* Targeting macros
*
Expand All @@ -31,47 +32,75 @@
*/

// PTX ISA 8.3 is available from CUDA 12.3, driver r545
#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__))
// The first define is for future major versions of CUDACC.
// We make sure that these get the highest known PTX ISA version.
#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12)) || (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 830ULL
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) \
miscco marked this conversation as resolved.
Show resolved Hide resolved
miscco marked this conversation as resolved.
Show resolved Hide resolved
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 830ULL
// PTX ISA 8.2 is available from CUDA 12.2, driver r535
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 820ULL
// PTX ISA 8.1 is available from CUDA 12.1, driver r530
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 810ULL
// PTX ISA 8.0 is available from CUDA 12.0, driver r525
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 800ULL
// PTX ISA 7.8 is available from CUDA 11.8, driver r520
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 780ULL
// PTX ISA 7.7 is available from CUDA 11.7, driver r515
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 770ULL
// PTX ISA 7.6 is available from CUDA 11.6, driver r510
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 760ULL
// PTX ISA 7.5 is available from CUDA 11.5, driver r495
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 750ULL
// PTX ISA 7.4 is available from CUDA 11.4, driver r470
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 740ULL
// PTX ISA 7.3 is available from CUDA 11.3, driver r465
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 730ULL
// PTX ISA 7.2 is available from CUDA 11.2, driver r460
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 720ULL
// PTX ISA 7.1 is available from CUDA 11.1, driver r455
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 710ULL
// PTX ISA 7.0 is available from CUDA 11.0, driver r445
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) \
|| (!defined(__CUDACC_VER_MAJOR__))
# define __cccl_ptx_isa 700ULL
// Fallback case. Define the ISA version to be zero. This ensures that the macro is always defined.
#else
# define __cccl_ptx_isa 0ULL
#endif

#endif // _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
// We define certain feature test macros depending on availability. When
// __CUDA_MINIMUM_ARCH__ is not available, we define the following features
// depending on PTX ISA. This permits checking for the feature in host code.
// When __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the
// hardware supports it.
#if (!defined(__CUDA_MINIMUM_ARCH__)) \
|| (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) && __cccl_isa_ptx >= 800
# define __cccl_lib_local_barrier_arrive_tx
# define __cccl_lib_experimental_ctk12_cp_async_exposure
#endif

#endif // __CCCL_PTX_ISA_H_
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "__cccl/diagnostic.h"
#include "__cccl/dialect.h"
#include "__cccl/execution_space.h"
#include "__cccl/ptx_isa.h"
#include "__cccl/system_header.h"
#include "__cccl/version.h"
#include "__cccl/visibility.h"
Expand Down
Loading
Loading