Commit 6892523

Fixed

ahendriksen committed Feb 14, 2024
1 parent c9756c0

Showing 4 changed files with 77 additions and 59 deletions.
@@ -255,6 +255,26 @@ _LIBCUDACXX_HOST_DEVICE
     return __result;
 }
 
+template <typename _Tp, int _Sco, bool _Ref>
+_LIBCUDACXX_HOST_DEVICE
+bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp* __expected, _Tp __val, memory_order __success, memory_order __failure) {
+    // Both aref and atomic pass through here.
+    // static_assert(sizeof(_Tp) == 0);
+    alignas(_Tp) auto __tmp = *__expected;
+    bool __result = false;
+    NV_DISPATCH_TARGET(
+        NV_IS_DEVICE, (
+            alignas(_Tp) auto __tmp_v = __val;
+            __result = __atomic_compare_exchange_cuda(__cxx_get_underlying_device_atomic(__a), &__tmp, &__tmp_v, false, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure), __scope_tag<_Sco>());
+        ),
+        NV_IS_HOST, (
+            __result = __host::__cxx_atomic_compare_exchange_strong(&__a->__a_value, &__tmp, __val, __success, __failure);
+        )
+    )
+    *__expected = __tmp;
+    return __result;
+}
+
 template <typename _Tp, int _Sco, bool _Ref>
 _LIBCUDACXX_HOST_DEVICE
 bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp* __expected, _Tp __val, memory_order __success, memory_order __failure) {
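
Note: the overload added above is the non-volatile counterpart of the volatile
__cxx_atomic_compare_exchange_strong that already existed; both cuda::atomic and
cuda::atomic_ref funnel compare_exchange_strong through it ("Both aref and
atomic pass through here"). A hedged sketch of the calling side, reconstructed
from the test PTX below (the mangled name _Z6squarePiS_i demangles to
square(int*, int*, int)); the kernel body is an assumption, not part of this
commit:

#include <cuda/atomic>

// Hypothetical test kernel: the PTX below performs one acquire CAS whose
// desired value is the thread index and whose observed old value is stored
// back through the second pointer.
__global__ void square(int* data, int* out, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        cuda::atomic_ref<int, cuda::thread_scope_device> ref(data[i]);
        int expected = out[i];
        // Compiles to a single atom.cas.acquire.gpu.b32 on SM70+.
        ref.compare_exchange_strong(expected, i, cuda::memory_order_acquire);
        out[i] = expected;  // expected now holds the value observed by the CAS
    }
}
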
@@ -1372,6 +1372,42 @@ _LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _T
     memcpy(__expected, &__old, 4);
     return __ret;
 }
 
+template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==4, int> = 0>
+_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) {
+    uint32_t __tmp = 0, __old = 0, __old_tmp;
+    memcpy(&__tmp, __desired, 4);
+    memcpy(&__old, __expected, 4);
+    __old_tmp = __old;
+    NV_DISPATCH_TARGET(
+        NV_PROVIDES_SM_70, (
+            switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) {
+            case __ATOMIC_SEQ_CST: __cuda_fence_sc_device();
+            case __ATOMIC_CONSUME:
+            case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            case __ATOMIC_RELEASE: __cuda_compare_exchange_release_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            default: assert(0);
+            }
+        ),
+        NV_IS_DEVICE, (
+            switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) {
+            case __ATOMIC_SEQ_CST:
+            case __ATOMIC_ACQ_REL: __cuda_membar_device();
+            case __ATOMIC_CONSUME:
+            case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_32_device(__ptr, __old, __old_tmp, __tmp); __cuda_membar_device(); break;
+            case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            default: assert(0);
+            }
+        )
+    )
+    bool const __ret = __old == __old_tmp;
+    memcpy(__expected, &__old, 4);
+    return __ret;
+}
+
 template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _LIBCUDACXX_DEVICE void __cuda_exchange_acq_rel_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
 template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _LIBCUDACXX_DEVICE void __cuda_exchange_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
 template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _LIBCUDACXX_DEVICE void __cuda_exchange_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
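
Note on the NV_IS_DEVICE branch above: pre-SM70 GPUs have no .acquire/.release
atomics, so ordering is synthesized around a plain volatile CAS with membar
fences; SEQ_CST and ACQ_REL fall through to a fence before the CAS, and the
acquire-flavored orders add a fence after it. A minimal standalone sketch of
that pattern (helper names are illustrative, not the library's):

// Hedged sketch of the pre-SM70 fallback: the hardware CAS itself is
// unordered; membar.gl fences supply the acquire and release halves.
__device__ unsigned cas_volatile_u32(unsigned* p, unsigned cmp, unsigned val) {
    unsigned old;
    asm volatile("atom.cas.b32 %0,[%1],%2,%3;"
                 : "=r"(old)
                 : "l"(p), "r"(cmp), "r"(val)
                 : "memory");
    return old;
}

__device__ void membar_gl() { asm volatile("membar.gl;" ::: "memory"); }

__device__ unsigned cas_acq_rel_pre_sm70(unsigned* p, unsigned cmp, unsigned val) {
    membar_gl();                                  // release half: order earlier accesses
    unsigned old = cas_volatile_u32(p, cmp, val);
    membar_gl();                                  // acquire half: order later accesses
    return old;
}
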
test_aref.ptx: 49 changes (12 additions, 37 deletions)

@@ -19,64 +19,39 @@
     .param .u32 _Z6squarePiS_i_param_2
 )
 {
-    .local .align 8 .b8 __local_depot0[8];
-    .reg .b64 %SP;
-    .reg .b64 %SPL;
     .reg .pred %p<2>;
-    .reg .b32 %r<10>;
-    .reg .b64 %rd<10>;
+    .reg .b32 %r<9>;
+    .reg .b64 %rd<7>;
 
 
-    mov.u64 %SPL, __local_depot0;
-    cvta.local.u64 %SP, %SPL;
     ld.param.u64 %rd1, [_Z6squarePiS_i_param_0];
     ld.param.u64 %rd2, [_Z6squarePiS_i_param_1];
     ld.param.u32 %r2, [_Z6squarePiS_i_param_2];
-    mov.u32 %r3, %ctaid.x;
-    mov.u32 %r4, %ntid.x;
+    mov.u32 %r3, %ntid.x;
+    mov.u32 %r4, %ctaid.x;
     mov.u32 %r5, %tid.x;
-    mad.lo.s32 %r1, %r4, %r3, %r5;
+    mad.lo.s32 %r1, %r3, %r4, %r5;
     setp.ge.s32 %p1, %r1, %r2;
     @%p1 bra $L__BB0_2;
 
-    add.u64 %rd3, %SP, 0;
-    add.u64 %rd5, %SPL, 0;
+    cvta.to.global.u64 %rd4, %rd2;
     // begin inline asm
     // Before atom_{ref} definition
     // end inline asm
-    mul.wide.s32 %rd6, %r1, 4;
-    add.s64 %rd7, %rd1, %rd6;
-    st.local.u64 [%rd5], %rd7;
+    mul.wide.s32 %rd5, %r1, 4;
+    add.s64 %rd3, %rd1, %rd5;
     // begin inline asm
     // After atom_{ref} definition
     // end inline asm
-    cvta.to.global.u64 %rd8, %rd2;
-    add.s64 %rd9, %rd8, %rd6;
+    add.s64 %rd6, %rd4, %rd5;
     // begin inline asm
     // split compare_exchange_strong before
     // end inline asm
-    ld.global.u32 %r8, [%rd9];
-    mov.u32 %r6, 1;
+    ld.global.u32 %r7, [%rd6];
     // begin inline asm
-    // before get underlying atomic ref = %r6
+    atom.cas.acquire.gpu.b32 %r6,[%rd3],%r7,%r1;
     // end inline asm
-    // begin inline asm
-    // deref get underlying atomic ref = %rd3
-    // end inline asm
-    // begin inline asm
-    // Inside get underlying atomic (1)
-    // end inline asm
-    ld.local.u64 %rd4, [%rd5];
-    // begin inline asm
-    // Inside get underlying atomic (2)
-    // end inline asm
-    // begin inline asm
-    // After get underlying atomic
-    // end inline asm
-    // begin inline asm
-    atom.cas.acquire.gpu.b32 %r7,[%rd4],%r8,%r1;
-    // end inline asm
-    st.global.u32 [%rd9], %r7;
+    st.global.u32 [%rd6], %r6;
 
 $L__BB0_2:
     ret;
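
Net effect in this file: the atomic_ref path no longer spills the underlying
pointer to a local stack slot (__local_depot0) and reloads it with ld.local.u64
before the CAS; the fixed codegen is a single atom.cas.acquire.gpu.b32 with the
same register usage as the plain cuda::atomic kernel in test_atomic.ptx below.
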
test_atomic.ptx: 31 changes (9 additions, 22 deletions)

@@ -20,8 +20,8 @@
 )
 {
     .reg .pred %p<2>;
-    .reg .b32 %r<10>;
-    .reg .b64 %rd<8>;
+    .reg .b32 %r<9>;
+    .reg .b64 %rd<7>;
 
 
     ld.param.u64 %rd1, [_Z6squarePiS_i_param_0];

@@ -34,34 +34,21 @@
     setp.ge.s32 %p1, %r1, %r2;
     @%p1 bra $L__BB0_2;
 
-    cvta.to.global.u64 %rd5, %rd2;
+    cvta.to.global.u64 %rd4, %rd2;
     // begin inline asm
     // Before atom_{ref} definition
     // end inline asm
-    mul.wide.s32 %rd6, %r1, 4;
-    add.s64 %rd4, %rd1, %rd6;
+    mul.wide.s32 %rd5, %r1, 4;
+    add.s64 %rd3, %rd1, %rd5;
     // begin inline asm
     // After atom_{ref} definition
     // end inline asm
-    add.s64 %rd7, %rd5, %rd6;
-    ld.global.u32 %r8, [%rd7];
-    mov.u32 %r6, 0;
+    add.s64 %rd6, %rd4, %rd5;
+    ld.global.u32 %r7, [%rd6];
     // begin inline asm
-    // before get underlying atomic ref = %r6
+    atom.cas.acquire.gpu.b32 %r6,[%rd3],%r7,%r1;
     // end inline asm
-    // begin inline asm
-    // deref get underlying atomic ref = %rd4
-    // end inline asm
-    // begin inline asm
-    // Inside get underlying atomic (non-ref base_impl)
-    // end inline asm
-    // begin inline asm
-    // After get underlying atomic
-    // end inline asm
-    // begin inline asm
-    atom.cas.acquire.gpu.b32 %r7,[%rd4],%r8,%r1;
-    // end inline asm
-    st.global.u32 [%rd7], %r7;
+    st.global.u32 [%rd6], %r6;
 
 $L__BB0_2:
     ret;
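
For reproduction, listings like these can be regenerated with something along
the lines of nvcc -arch=sm_70 -ptx test_aref.cu (the .cu source file names are
an assumption; only the .ptx outputs appear in this commit).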
