From 6892523fe82b03e7f0e6978734e19947a016b35d Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 14 Feb 2024 13:20:30 +0100
Subject: [PATCH] Add non-volatile compare_exchange_strong overloads to
 avoid a local-memory round trip

---
 .../include/support/atomic/atomic_cuda.h    | 19 +++++++
 .../support/atomic/atomic_cuda_generated.h  | 36 ++++++++++++++
 test_aref.ptx                               | 49 +++++--------------
 test_atomic.ptx                             | 31 ++++--------
 4 files changed, 76 insertions(+), 59 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h
index a9b32bbbff7..1ddaaa5692f 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h
@@ -255,6 +255,25 @@ _LIBCUDACXX_HOST_DEVICE
     return __result;
 }
 
+template <typename _Tp, int _Sco, bool _Ref>
+_LIBCUDACXX_HOST_DEVICE
+    bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp* __expected, _Tp __val, memory_order __success, memory_order __failure) {
+    // Both atomic_ref (_Ref == true) and atomic (_Ref == false) dispatch through this non-volatile overload.
+    alignas(_Tp) auto __tmp = *__expected;
+    bool __result = false;
+    NV_DISPATCH_TARGET(
+        NV_IS_DEVICE, (
+            alignas(_Tp) auto __tmp_v = __val;
+            __result = __atomic_compare_exchange_cuda(__cxx_get_underlying_device_atomic(__a), &__tmp, &__tmp_v, false, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure), __scope_tag<_Sco>());
+        ),
+        NV_IS_HOST, (
+            __result = __host::__cxx_atomic_compare_exchange_strong(&__a->__a_value, &__tmp, __val, __success, __failure);
+        )
+    )
+    *__expected = __tmp;
+    return __result;
+}
+
 template <typename _Tp, int _Sco, bool _Ref>
 _LIBCUDACXX_HOST_DEVICE
     bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp* __expected, _Tp __val, memory_order __success, memory_order __failure) {
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h
index 2969369c887..96530c4f5d7 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h
@@ -1372,6 +1372,42 @@ _LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _T
     memcpy(__expected, &__old, 4);
     return __ret;
 }
+
+template<class _Type, cuda::std::__enable_if_t<sizeof(_Type)==4, int> = 0>
+_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) {
+    uint32_t __tmp = 0, __old = 0, __old_tmp;
+    memcpy(&__tmp, __desired, 4);
+    memcpy(&__old, __expected, 4);
+    __old_tmp = __old;
+    NV_DISPATCH_TARGET(
+        NV_PROVIDES_SM_70, (
+            switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) {
+            case __ATOMIC_SEQ_CST: __cuda_fence_sc_device();
+            case __ATOMIC_CONSUME:
+            case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            case __ATOMIC_RELEASE: __cuda_compare_exchange_release_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            default: assert(0);
+            }
+        ),
+        NV_IS_DEVICE, (
+            switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) {
+            case __ATOMIC_SEQ_CST:
+            case __ATOMIC_ACQ_REL: __cuda_membar_device();
+            case __ATOMIC_CONSUME:
+            case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_32_device(__ptr, __old, __old_tmp, __tmp); __cuda_membar_device(); break;
+            case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_32_device(__ptr, __old, __old_tmp, __tmp); break;
+            default: assert(0);
+            }
+        )
+    )
+    bool const __ret = __old == __old_tmp;
+    memcpy(__expected, &__old, 4);
+    return __ret;
+}
+
 template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _LIBCUDACXX_DEVICE void __cuda_exchange_acq_rel_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acq_rel.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
 template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _LIBCUDACXX_DEVICE void __cuda_exchange_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.acquire.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
 template<class _CUDA_A, class _CUDA_B, class _CUDA_C> static inline _LIBCUDACXX_DEVICE void __cuda_exchange_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.exch.relaxed.gpu.b32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); }
diff --git a/test_aref.ptx b/test_aref.ptx
index 9b2238ee4b1..7b1f2c6bdc0 100644
--- a/test_aref.ptx
+++ b/test_aref.ptx
@@ -19,64 +19,39 @@
 	.param .u32 _Z6squarePiS_i_param_2
 )
 {
-	.local .align 8 .b8 	__local_depot0[8];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
 	.reg .pred 	%p<2>;
-	.reg .b32 	%r<10>;
-	.reg .b64 	%rd<10>;
+	.reg .b32 	%r<9>;
+	.reg .b64 	%rd<7>;
 
-	mov.u64 	%SPL, __local_depot0;
-	cvta.local.u64 	%SP, %SPL;
 	ld.param.u64 	%rd1, [_Z6squarePiS_i_param_0];
 	ld.param.u64 	%rd2, [_Z6squarePiS_i_param_1];
 	ld.param.u32 	%r2, [_Z6squarePiS_i_param_2];
-	mov.u32 	%r3, %ctaid.x;
-	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r3, %ntid.x;
+	mov.u32 	%r4, %ctaid.x;
 	mov.u32 	%r5, %tid.x;
-	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	mad.lo.s32 	%r1, %r3, %r4, %r5;
 	setp.ge.s32 	%p1, %r1, %r2;
 	@%p1 bra 	$L__BB0_2;
 
-	add.u64 	%rd3, %SP, 0;
-	add.u64 	%rd5, %SPL, 0;
+	cvta.to.global.u64 	%rd4, %rd2;
 	// begin inline asm
 	// Before atom_{ref} definition
 	// end inline asm
-	mul.wide.s32 	%rd6, %r1, 4;
-	add.s64 	%rd7, %rd1, %rd6;
-	st.local.u64 	[%rd5], %rd7;
+	mul.wide.s32 	%rd5, %r1, 4;
+	add.s64 	%rd3, %rd1, %rd5;
 	// begin inline asm
 	// After atom_{ref} definition
 	// end inline asm
-	cvta.to.global.u64 	%rd8, %rd2;
-	add.s64 	%rd9, %rd8, %rd6;
+	add.s64 	%rd6, %rd4, %rd5;
 	// begin inline asm
 	// split compare_exchange_strong before
 	// end inline asm
-	ld.global.u32 	%r8, [%rd9];
-	mov.u32 	%r6, 1;
+	ld.global.u32 	%r7, [%rd6];
 	// begin inline asm
-	// before get underlying atomic ref = %r6
+	atom.cas.acquire.gpu.b32 %r6,[%rd3],%r7,%r1;
 	// end inline asm
-	// begin inline asm
-	// deref get underlying atomic ref = %rd3
-	// end inline asm
-	// begin inline asm
-	// Inside get underlying atomic (1)
-	// end inline asm
-	ld.local.u64 	%rd4, [%rd5];
-	// begin inline asm
-	// Inside get underlying atomic (2)
-	// end inline asm
-	// begin inline asm
-	// After get underlying atomic
-	// end inline asm
-	// begin inline asm
-	atom.cas.acquire.gpu.b32 %r7,[%rd4],%r8,%r1;
-	// end inline asm
-	st.global.u32 	[%rd9], %r7;
+	st.global.u32 	[%rd6], %r6;
 
 $L__BB0_2:
 	ret;
diff --git a/test_atomic.ptx b/test_atomic.ptx
index 80f0e141ef5..c63ebbe41ed 100644
--- a/test_atomic.ptx
+++ b/test_atomic.ptx
@@ -20,8 +20,8 @@
 )
 {
 	.reg .pred 	%p<2>;
-	.reg .b32 	%r<10>;
-	.reg .b64 	%rd<8>;
+	.reg .b32 	%r<9>;
+	.reg .b64 	%rd<7>;
 
 	ld.param.u64 	%rd1, [_Z6squarePiS_i_param_0];
 	ld.param.u64 	%rd2, [_Z6squarePiS_i_param_1];
@@ -34,34 +34,21 @@
 	setp.ge.s32 	%p1, %r1, %r2;
 	@%p1 bra 	$L__BB0_2;
 
-	cvta.to.global.u64 	%rd5, %rd2;
+	cvta.to.global.u64 	%rd4, %rd2;
 	// begin inline asm
 	// Before atom_{ref} definition
 	// end inline asm
-	mul.wide.s32 	%rd6, %r1, 4;
-	add.s64 	%rd4, %rd1, %rd6;
+	mul.wide.s32 	%rd5, %r1, 4;
+	add.s64 	%rd3, %rd1, %rd5;
 	// begin inline asm
 	// After atom_{ref} definition
 	// end inline asm
-	add.s64 	%rd7, %rd5, %rd6;
-	ld.global.u32 	%r8, [%rd7];
-	mov.u32 	%r6, 0;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.u32 	%r7, [%rd6];
 	// begin inline asm
-	// before get underlying atomic ref = %r6
+	atom.cas.acquire.gpu.b32 %r6,[%rd3],%r7,%r1;
 	// end inline asm
-	// begin inline asm
-	// deref get underlying atomic ref = %rd4
-	// end inline asm
-	// begin inline asm
-	// Inside get underlying atomic (non-ref base_impl)
-	// end inline asm
-	// begin inline asm
-	// After get underlying atomic
-	// end inline asm
-	// begin inline asm
-	atom.cas.acquire.gpu.b32 %r7,[%rd4],%r8,%r1;
-	// end inline asm
-	st.global.u32 	[%rd7], %r7;
+	st.global.u32 	[%rd6], %r6;
 
 $L__BB0_2:
 	ret;
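
Note: the test source behind test_aref.ptx and test_atomic.ptx is not included in this patch. The sketch below is a hypothetical reconstruction from the PTX alone: the kernel signature follows from the mangled name _Z6squarePiS_i, which demangles to square(int*, int*, int), and the acquire ordering and .gpu scope follow from the emitted atom.cas.acquire.gpu.b32. The parameter roles, the choice of cuda::atomic_ref, and the variant split are assumptions; the "begin/end inline asm" marker comments visible in the PTX come from debugging instrumentation that this sketch omits.

// Hypothetical reconstruction of the aref test kernel, not the original source.
// test_atomic.ptx would presumably come from the same kernel using
// cuda::atomic<int, cuda::thread_scope_device> storage instead of atomic_ref.
#include <cuda/atomic>

__global__ void square(int* data, int* out, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;  // mad.lo.s32 %r1, %ntid.x, %ctaid.x, %tid.x
    if (i < n)
    {
        cuda::atomic_ref<int, cuda::thread_scope_device> ref(data[i]);
        int expected = out[i];                      // ld.global.u32 %r7, [%rd6]
        // With the new non-volatile overloads, this strong CAS lowers to a
        // single atom.cas.acquire.gpu.b32; before the patch, the atomic_ref
        // pointer took a round trip through __local_depot0 (st.local/ld.local).
        ref.compare_exchange_strong(expected, i, cuda::std::memory_order_acquire);
        out[i] = expected;                          // st.global.u32 [%rd6], %r6
    }
}

Compiled to PTX with something like nvcc -ptx for an SM_70+ target, both variants should now produce the identical three-instruction load/CAS/store sequence shown in the new test files, which is the point of the overloads above: the non-volatile path lets the compiler keep the underlying atomic pointer in a register instead of spilling it to local memory.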