From 6892523fe82b03e7f0e6978734e19947a016b35d Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 14 Feb 2024 13:20:30 +0100
Subject: [PATCH] Add non-volatile compare_exchange_strong overloads to
 avoid a local-memory round trip

---
 .../include/support/atomic/atomic_cuda.h    | 19 +++++++
 .../support/atomic/atomic_cuda_generated.h  | 36 ++++++++++++++
 test_aref.ptx                               | 49 +++++--------------
 test_atomic.ptx                             | 31 ++++--------
 4 files changed, 76 insertions(+), 59 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h
index a9b32bbbff7..1ddaaa5692f 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h
@@ -255,6 +255,25 @@ _LIBCUDACXX_HOST_DEVICE
     return __result;
 }
 
+template <typename _Tp, int _Sco, bool _Ref>
+_LIBCUDACXX_HOST_DEVICE
+    bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp* __expected, _Tp __val, memory_order __success, memory_order __failure) {
+    // Both atomic_ref (_Ref == true) and atomic (_Ref == false) dispatch through this non-volatile overload.
+    alignas(_Tp) auto __tmp = *__expected;
+    bool __result = false;
+    NV_DISPATCH_TARGET(
+        NV_IS_DEVICE, (
+            alignas(_Tp) auto __tmp_v = __val;
+            __result = __atomic_compare_exchange_cuda(__cxx_get_underlying_device_atomic(__a), &__tmp, &__tmp_v, false, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure), __scope_tag<_Sco>());
+        ),
+        NV_IS_HOST, (
+            __result = __host::__cxx_atomic_compare_exchange_strong(&__a->__a_value, &__tmp, __val, __success, __failure);
+        )
+    )
+    *__expected = __tmp;
+    return __result;
+}
+
 template <typename _Tp, int _Sco, bool _Ref>
 _LIBCUDACXX_HOST_DEVICE
     bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp* __expected, _Tp __val, memory_order __success, memory_order __failure) {
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h
index 2969369c887..96530c4f5d7 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h
@@ -1372,6 +1372,42 @@ _LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _T
     memcpy(__expected, &__old, 4);
     return __ret;
 }
+
+template<class _Type, cuda::std::__enable_if_t<sizeof(_Type)==4, int> = 0>
+_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) {
+    uint32_t __tmp = 0, __old = 0, __old_tmp;
+    memcpy(&__tmp, __desired, 4);
+    memcpy(&__old, __expected, 4);
+    __old_tmp = __old;
+    NV_DISPATCH_TARGET(
+        NV_PROVIDES_SM_70, (
+            switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) {
+            case __ATOMIC_SEQ_CST: __cuda_fence_sc_device();
+            case __ATOMIC_CONSUME:
+            case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            case __ATOMIC_RELEASE: __cuda_compare_exchange_release_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            default: assert(0);
+            }
+        ),
+        NV_IS_DEVICE, (
+            switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) {
+            case __ATOMIC_SEQ_CST:
+            case __ATOMIC_ACQ_REL: __cuda_membar_device();
+            case __ATOMIC_CONSUME:
+            case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_32_device(__ptr, __old, __old_tmp, __tmp); __cuda_membar_device(); break;
+            case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            default: assert(0);
+            }
+        )
+    )
+    bool const __ret = __old == __old_tmp;
+    memcpy(__expected, &__old, 4);
+    return __ret;
+}
+
 template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _LIBCUDACXX_DEVICE void __cuda_exchange_acq_rel_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
 template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _LIBCUDACXX_DEVICE void __cuda_exchange_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
 template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _LIBCUDACXX_DEVICE void __cuda_exchange_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
diff --git a/test_aref.ptx b/test_aref.ptx
index 9b2238ee4b1..7b1f2c6bdc0 100644
--- a/test_aref.ptx
+++ b/test_aref.ptx
@@ -19,64 +19,39 @@
 	.param .u32 _Z6squarePiS_i_param_2
 )
 {
-	.local .align 8 .b8 	__local_depot0[8];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
 	.reg .pred 	%p<2>;
-	.reg .b32 	%r<10>;
-	.reg .b64 	%rd<10>;
+	.reg .b32 	%r<9>;
+	.reg .b64 	%rd<7>;
 
-	mov.u64 	%SPL, __local_depot0;
-	cvta.local.u64 	%SP, %SPL;
 	ld.param.u64 	%rd1, [_Z6squarePiS_i_param_0];
 	ld.param.u64 	%rd2, [_Z6squarePiS_i_param_1];
 	ld.param.u32 	%r2, [_Z6squarePiS_i_param_2];
-	mov.u32 	%r3, %ctaid.x;
-	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r3, %ntid.x;
+	mov.u32 	%r4, %ctaid.x;
 	mov.u32 	%r5, %tid.x;
-	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	mad.lo.s32 	%r1, %r3, %r4, %r5;
 	setp.ge.s32 	%p1, %r1, %r2;
 	@%p1 bra 	$L__BB0_2;
 
-	add.u64 	%rd3, %SP, 0;
-	add.u64 	%rd5, %SPL, 0;
+	cvta.to.global.u64 	%rd4, %rd2;
 	// begin inline asm
 	// Before atom_{ref} definition
 	// end inline asm
-	mul.wide.s32 	%rd6, %r1, 4;
-	add.s64 	%rd7, %rd1, %rd6;
-	st.local.u64 	[%rd5], %rd7;
+	mul.wide.s32 	%rd5, %r1, 4;
+	add.s64 	%rd3, %rd1, %rd5;
 	// begin inline asm
 	// After atom_{ref} definition
 	// end inline asm
-	cvta.to.global.u64 	%rd8, %rd2;
-	add.s64 	%rd9, %rd8, %rd6;
+	add.s64 	%rd6, %rd4, %rd5;
 	// begin inline asm
 	// split compare_exchange_strong before
 	// end inline asm
-	ld.global.u32 	%r8, [%rd9];
-	mov.u32 	%r6, 1;
+	ld.global.u32 	%r7, [%rd6];
 	// begin inline asm
-	// before get underlying atomic ref = %r6
+	atom.cas.acquire.gpu.b32 %r6,[%rd3],%r7,%r1;
 	// end inline asm
-	// begin inline asm
-	// deref get underlying atomic ref = %rd3
-	// end inline asm
-	// begin inline asm
-	// Inside get underlying atomic (1)
-	// end inline asm
-	ld.local.u64 	%rd4, [%rd5];
-	// begin inline asm
-	// Inside get underlying atomic (2)
-	// end inline asm
-	// begin inline asm
-	// After get underlying atomic
-	// end inline asm
-	// begin inline asm
-	atom.cas.acquire.gpu.b32 %r7,[%rd4],%r8,%r1;
-	// end inline asm
-	st.global.u32 	[%rd9], %r7;
+	st.global.u32 	[%rd6], %r6;
 
 $L__BB0_2:
 	ret;
diff --git a/test_atomic.ptx b/test_atomic.ptx
index 80f0e141ef5..c63ebbe41ed 100644
--- a/test_atomic.ptx
+++ b/test_atomic.ptx
@@ -20,8 +20,8 @@
 )
 {
 	.reg .pred 	%p<2>;
-	.reg .b32 	%r<10>;
-	.reg .b64 	%rd<8>;
+	.reg .b32 	%r<9>;
+	.reg .b64 	%rd<7>;
 
 	ld.param.u64 	%rd1, [_Z6squarePiS_i_param_0];
 	ld.param.u64 	%rd2, [_Z6squarePiS_i_param_1];
@@ -34,34 +34,21 @@
 	setp.ge.s32 	%p1, %r1, %r2;
 	@%p1 bra 	$L__BB0_2;
 
-	cvta.to.global.u64 	%rd5, %rd2;
+	cvta.to.global.u64 	%rd4, %rd2;
 	// begin inline asm
 	// Before atom_{ref} definition
 	// end inline asm
-	mul.wide.s32 	%rd6, %r1, 4;
-	add.s64 	%rd4, %rd1, %rd6;
+	mul.wide.s32 	%rd5, %r1, 4;
+	add.s64 	%rd3, %rd1, %rd5;
 	// begin inline asm
 	// After atom_{ref} definition
 	// end inline asm
-	add.s64 	%rd7, %rd5, %rd6;
-	ld.global.u32 	%r8, [%rd7];
-	mov.u32 	%r6, 0;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.u32 	%r7, [%rd6];
 	// begin inline asm
-	// before get underlying atomic ref = %r6
+	atom.cas.acquire.gpu.b32 %r6,[%rd3],%r7,%r1;
 	// end inline asm
-	// begin inline asm
-	// deref get underlying atomic ref = %rd4
-	// end inline asm
-	// begin inline asm
-	// Inside get underlying atomic (non-ref base_impl)
-	// end inline asm
-	// begin inline asm
-	// After get underlying atomic
-	// end inline asm
-	// begin inline asm
-	atom.cas.acquire.gpu.b32 %r7,[%rd4],%r8,%r1;
-	// end inline asm
-	st.global.u32 	[%rd7], %r7;
+	st.global.u32 	[%rd6], %r6;
 
 $L__BB0_2:
 	ret;
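
Note: the test source behind test_aref.ptx and test_atomic.ptx is not included in this patch. The sketch below is a hypothetical reconstruction from the PTX alone: the kernel signature follows from the mangled name _Z6squarePiS_i, which demangles to square(int*, int*, int), and the acquire ordering and .gpu scope follow from the emitted atom.cas.acquire.gpu.b32. The parameter roles, the choice of cuda::atomic_ref, and the variant split are assumptions; the "begin/end inline asm" marker comments visible in the PTX come from debugging instrumentation that this sketch omits.

// Hypothetical reconstruction of the aref test kernel, not the original source.
// test_atomic.ptx would presumably come from the same kernel using
// cuda::atomic<int, cuda::thread_scope_device> storage instead of atomic_ref.
#include <cuda/atomic>

__global__ void square(int* data, int* out, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;  // mad.lo.s32 %r1, %ntid.x, %ctaid.x, %tid.x
    if (i < n)
    {
        cuda::atomic_ref<int, cuda::thread_scope_device> ref(data[i]);
        int expected = out[i];                      // ld.global.u32 %r7, [%rd6]
        // With the new non-volatile overloads, this strong CAS lowers to a
        // single atom.cas.acquire.gpu.b32; before the patch, the atomic_ref
        // pointer took a round trip through __local_depot0 (st.local/ld.local).
        ref.compare_exchange_strong(expected, i, cuda::std::memory_order_acquire);
        out[i] = expected;                          // st.global.u32 [%rd6], %r6
    }
}

Compiled to PTX with something like nvcc -ptx for an SM_70+ target, both variants should now produce the identical three-instruction load/CAS/store sequence shown in the new test files, which is the point of the overloads above: the non-volatile path lets the compiler keep the underlying atomic pointer in a register instead of spilling it to local memory.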