Debug atomic ref generating st.local and ld.local
The problem seems to be that, on the path to the actual PTX instruction,
a volatile pointer is dereferenced, which causes a spill to local memory.
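A minimal sketch of the pattern described above, not taken from the commit (the struct and function names are illustrative only): once a stack-allocated wrapper that stores a pointer is reached through a volatile-qualified pointer, the compiler has to materialize the wrapper in memory and genuinely reload the member, which on the device shows up as an st.local/ld.local pair.

// Hypothetical reduction of the reported spill; not the library's actual types.
struct ref_like { int* target; };          // stands in for the atomic_ref storage

__device__ int* through_volatile(int* p) {
    ref_like r{p};                         // wrapper lives in the thread's local stack frame
    volatile ref_like* vr = &r;            // the library path reaches it via a volatile pointer
    return vr->target;                     // volatile access: the pointer must be stored (st.local)
                                           // and read back (ld.local) rather than kept in a register
}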
ahendriksen committed Feb 14, 2024
1 parent 74f1160 commit c9756c0
Showing 7 changed files with 214 additions and 8 deletions.
10 changes: 9 additions & 1 deletion libcudacxx/include/cuda/std/detail/libcxx/include/atomic
@@ -1405,6 +1405,7 @@ struct __atomic_base_core : public __atomic_base_storage<_Tp, _Storage>{
bool compare_exchange_strong(_Tp& __e, _Tp __d,
memory_order __m = memory_order_seq_cst) noexcept {
if (memory_order_acq_rel == __m)
// Atomic goes through this path
return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire);
else if (memory_order_release == __m)
return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed);
@@ -1529,12 +1530,19 @@ struct __atomic_base_core<_Tp, true, _Storage> : public __atomic_base_storage<_T
_LIBCUDACXX_INLINE_VISIBILITY
bool compare_exchange_strong(_Tp& __e, _Tp __d,
memory_order __m = memory_order_seq_cst) const noexcept {
if (memory_order_acq_rel == __m)
// Atomic_ref goes through this path.
asm volatile("// split compare_exchange_strong before":::"memory");

if (memory_order_acq_rel == __m){
return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire);

}
else if (memory_order_release == __m)
return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed);
else
return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m);

asm volatile("// split compare_exchange_strong after":::"memory");
}

_LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept
@@ -112,9 +112,16 @@ _Tp* __cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp
}

template <typename _Tp, int _Sco, bool _Ref>
_LIBCUDACXX_INLINE_VISIBILITY constexpr
_LIBCUDACXX_INLINE_VISIBILITY
volatile _Tp* __cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a) noexcept {
return __cxx_get_underlying_atomic(&__a->__a_value);
// static_assert(sizeof(_Tp) == 0);
asm volatile("// before get underlying atomic ref = %0" :: "r"(int(_Ref)): "memory");
auto deref = &__a->__a_value; // __host::__cxx_atomic_ref_base_impl
asm volatile("// deref get underlying atomic ref = %0" :: "l"(deref): "memory");

auto ret= __cxx_get_underlying_atomic(deref);
asm volatile("// After get underlying atomic" ::: "memory");
return ret;
}

template <typename _Tp, int _Sco, bool _Ref>
@@ -231,6 +238,8 @@ _LIBCUDACXX_HOST_DEVICE
template <typename _Tp, int _Sco, bool _Ref>
_LIBCUDACXX_HOST_DEVICE
bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp* __expected, _Tp __val, memory_order __success, memory_order __failure) {
// Both aref and atomic pass through here.
// static_assert(sizeof(_Tp) == 0);
alignas(_Tp) auto __tmp = *__expected;
bool __result = false;
NV_DISPATCH_TARGET(
@@ -1332,7 +1332,10 @@ _LIBCUDACXX_DEVICE void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val,
}
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_acq_rel_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) {
// static_assert(sizeof(_CUDA_C) != 4);
asm volatile("atom.cas.relaxed.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory");
}
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_release_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==4, int> = 0>
@@ -43,9 +43,12 @@ _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> * __a) noexce
return &__a->__a_value;
}
template <typename _Tp, int _Sco>
_LIBCUDACXX_INLINE_VISIBILITY constexpr
_LIBCUDACXX_INLINE_VISIBILITY
volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> volatile* __a) noexcept {
return &__a->__a_value;
// static_assert(sizeof(_Tp) == 0);
asm volatile("// Inside get underlying atomic (non-ref base_impl)" ::: "memory");

return &__a->__a_value; // Because it does not actually dereference, this does not generate an ld.local (atomic path)
}
template <typename _Tp, int _Sco>
_LIBCUDACXX_INLINE_VISIBILITY constexpr
@@ -109,9 +112,12 @@ _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco>* __a) noe
return __a->__a_value;
}
template <typename _Tp, int _Sco>
_LIBCUDACXX_INLINE_VISIBILITY constexpr
_LIBCUDACXX_INLINE_VISIBILITY
volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> volatile* __a) noexcept {
return __a->__a_value;
asm volatile("// Inside get underlying atomic (1)" ::: "memory");
auto deref = __a->__a_value; // <-- this deref loads from local, i.e. ld.local (atomic_ref path)
asm volatile("// Inside get underlying atomic (2)" ::: "memory");
return deref;
}
template <typename _Tp, int _Sco>
_LIBCUDACXX_INLINE_VISIBILITY constexpr
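For context, not part of the commit: the asymmetry that the debug comments above point at reduces to address-of versus load. The atomic overload only forms the address of an inline member (no memory access), while the atomic_ref overload reads a stored pointer through a volatile object, a load the compiler may not elide. A hypothetical sketch with illustrative names:

// Not the library's code; a minimal contrast of the two overloads above.
struct base_like { int  value; };          // atomic<T>: value stored inline
struct ref_like2 { int* value; };          // atomic_ref<T>: pointer to the value stored elsewhere

__device__ volatile int* underlying(volatile base_like* a) {
    return &a->value;                      // pure address computation: no load, no ld.local
}

__device__ volatile int* underlying(volatile ref_like2* a) {
    return a->value;                       // volatile read of the stored pointer: a real load is
                                           // required, which becomes ld.local when *a is on the stack
}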
25 changes: 25 additions & 0 deletions test.cu
@@ -0,0 +1,25 @@

#include<cuda/atomic>

template <typename T, typename V> union U { T t; V v; };
using atom_t = cuda::atomic<int, cuda::thread_scope_device>*;
using aref_t = cuda::atomic_ref<int, cuda::thread_scope_device>;

// Type your code here, or load an example.
__global__ void square(int* data,
int* array,
int n) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < n) {
asm volatile("// Before atom_{ref} definition" ::: "memory");
// Spill to local happens here (for atomic_ref). (st.local)
#ifdef AREF
auto ref = aref_t{*(data + tid)};
#else
auto& ref = *U<atom_t, aref_t>{ .v = aref_t{*(data + tid)} }.t;
#endif
asm volatile("// After atom_{ref} definition" ::: "memory");

ref.compare_exchange_strong(array[tid], tid, cuda::std::memory_order_acquire);
}
}
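The two PTX listings below presumably correspond to this kernel built with and without -DAREF (the exact compile invocation is not recorded in the commit; the PTX headers indicate CUDA 12.2 targeting sm_90). The atomic_ref build allocates a local depot and contains the st.local.u64/ld.local.u64 pair between the inline-asm debug markers, while the plain atomic build uses no local storage at all.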
85 changes: 85 additions & 0 deletions test_aref.ptx
@@ -0,0 +1,85 @@
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-32965470
// Cuda compilation tools, release 12.2, V12.2.91
// Based on NVVM 7.0.1
//

.version 8.2
.target sm_90
.address_size 64

// .globl _Z6squarePiS_i
.global .align 4 .b8 _ZZN4cuda3std3__48__detail21__stronger_order_cudaEiiE7__xform[16] = {3, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 3};

.visible .entry _Z6squarePiS_i(
.param .u64 _Z6squarePiS_i_param_0,
.param .u64 _Z6squarePiS_i_param_1,
.param .u32 _Z6squarePiS_i_param_2
)
{
.local .align 8 .b8 __local_depot0[8];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<2>;
.reg .b32 %r<10>;
.reg .b64 %rd<10>;


mov.u64 %SPL, __local_depot0;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [_Z6squarePiS_i_param_0];
ld.param.u64 %rd2, [_Z6squarePiS_i_param_1];
ld.param.u32 %r2, [_Z6squarePiS_i_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra $L__BB0_2;

add.u64 %rd3, %SP, 0;
add.u64 %rd5, %SPL, 0;
// begin inline asm
// Before atom_{ref} definition
// end inline asm
mul.wide.s32 %rd6, %r1, 4;
add.s64 %rd7, %rd1, %rd6;
st.local.u64 [%rd5], %rd7;
// begin inline asm
// After atom_{ref} definition
// end inline asm
cvta.to.global.u64 %rd8, %rd2;
add.s64 %rd9, %rd8, %rd6;
// begin inline asm
// split compare_exchange_strong before
// end inline asm
ld.global.u32 %r8, [%rd9];
mov.u32 %r6, 1;
// begin inline asm
// before get underlying atomic ref = %r6
// end inline asm
// begin inline asm
// deref get underlying atomic ref = %rd3
// end inline asm
// begin inline asm
// Inside get underlying atomic (1)
// end inline asm
ld.local.u64 %rd4, [%rd5];
// begin inline asm
// Inside get underlying atomic (2)
// end inline asm
// begin inline asm
// After get underlying atomic
// end inline asm
// begin inline asm
atom.cas.acquire.gpu.b32 %r7,[%rd4],%r8,%r1;
// end inline asm
st.global.u32 [%rd9], %r7;

$L__BB0_2:
ret;

}

70 changes: 70 additions & 0 deletions test_atomic.ptx
@@ -0,0 +1,70 @@
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-32965470
// Cuda compilation tools, release 12.2, V12.2.91
// Based on NVVM 7.0.1
//

.version 8.2
.target sm_90
.address_size 64

// .globl _Z6squarePiS_i
.global .align 4 .b8 _ZZN4cuda3std3__48__detail21__stronger_order_cudaEiiE7__xform[16] = {3, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 3};

.visible .entry _Z6squarePiS_i(
.param .u64 _Z6squarePiS_i_param_0,
.param .u64 _Z6squarePiS_i_param_1,
.param .u32 _Z6squarePiS_i_param_2
)
{
.reg .pred %p<2>;
.reg .b32 %r<10>;
.reg .b64 %rd<8>;


ld.param.u64 %rd1, [_Z6squarePiS_i_param_0];
ld.param.u64 %rd2, [_Z6squarePiS_i_param_1];
ld.param.u32 %r2, [_Z6squarePiS_i_param_2];
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r3, %r4, %r5;
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra $L__BB0_2;

cvta.to.global.u64 %rd5, %rd2;
// begin inline asm
// Before atom_{ref} definition
// end inline asm
mul.wide.s32 %rd6, %r1, 4;
add.s64 %rd4, %rd1, %rd6;
// begin inline asm
// After atom_{ref} definition
// end inline asm
add.s64 %rd7, %rd5, %rd6;
ld.global.u32 %r8, [%rd7];
mov.u32 %r6, 0;
// begin inline asm
// before get underlying atomic ref = %r6
// end inline asm
// begin inline asm
// deref get underlying atomic ref = %rd4
// end inline asm
// begin inline asm
// Inside get underlying atomic (non-ref base_impl)
// end inline asm
// begin inline asm
// After get underlying atomic
// end inline asm
// begin inline asm
atom.cas.acquire.gpu.b32 %r7,[%rd4],%r8,%r1;
// end inline asm
st.global.u32 [%rd7], %r7;

$L__BB0_2:
ret;

}
