Debug atomic ref generating st.local and ld.local
The problem seems to be that, on the path to the actual PTX instruction,
a volatile pointer is dereferenced, which causes a spill to local memory.
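A minimal sketch of the pattern described above, not taken from the commit (the struct and function names are illustrative only): once a stack-allocated wrapper that stores a pointer is reached through a volatile-qualified pointer, the compiler has to materialize the wrapper in memory and genuinely reload the member, which on the device shows up as an st.local/ld.local pair.

// Hypothetical reduction of the reported spill; not the library's actual types.
struct ref_like { int* target; };          // stands in for the atomic_ref storage

__device__ int* through_volatile(int* p) {
    ref_like r{p};                         // wrapper lives in the thread's local stack frame
    volatile ref_like* vr = &r;            // the library path reaches it via a volatile pointer
    return vr->target;                     // volatile access: the pointer must be stored (st.local)
                                           // and read back (ld.local) rather than kept in a register
}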
ahendriksen committed Feb 14, 2024
1 parent 74f1160 commit c9756c0
Showing 7 changed files with 214 additions and 8 deletions.
10 changes: 9 additions & 1 deletion libcudacxx/include/cuda/std/detail/libcxx/include/atomic
@@ -1405,6 +1405,7 @@ struct __atomic_base_core : public __atomic_base_storage<_Tp, _Storage>{
bool compare_exchange_strong(_Tp& __e, _Tp __d,
memory_order __m = memory_order_seq_cst) noexcept {
if (memory_order_acq_rel == __m)
// Atomic goes through this path
return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire);
else if (memory_order_release == __m)
return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed);
@@ -1529,12 +1530,19 @@ struct __atomic_base_core<_Tp, true, _Storage> : public __atomic_base_storage<_T
_LIBCUDACXX_INLINE_VISIBILITY
bool compare_exchange_strong(_Tp& __e, _Tp __d,
memory_order __m = memory_order_seq_cst) const noexcept {
if (memory_order_acq_rel == __m)
// Atomic_ref goes through this path.
asm volatile("// split compare_exchange_strong before":::"memory");

if (memory_order_acq_rel == __m){
return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire);

}
else if (memory_order_release == __m)
return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed);
else
return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m);

asm volatile("// split compare_exchange_strong after":::"memory");
}

_LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept
@@ -112,9 +112,16 @@ _Tp* __cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp
}

template <typename _Tp, int _Sco, bool _Ref>
_LIBCUDACXX_INLINE_VISIBILITY constexpr
_LIBCUDACXX_INLINE_VISIBILITY
volatile _Tp* __cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a) noexcept {
return __cxx_get_underlying_atomic(&__a->__a_value);
// static_assert(sizeof(_Tp) == 0);
asm volatile("// before get underlying atomic ref = %0" :: "r"(int(_Ref)): "memory");
auto deref = &__a->__a_value; // __host::__cxx_atomic_ref_base_impl
asm volatile("// deref get underlying atomic ref = %0" :: "l"(deref): "memory");

auto ret= __cxx_get_underlying_atomic(deref);
asm volatile("// After get underlying atomic" ::: "memory");
return ret;
}

template <typename _Tp, int _Sco, bool _Ref>
@@ -231,6 +238,8 @@ _LIBCUDACXX_HOST_DEVICE
template <typename _Tp, int _Sco, bool _Ref>
_LIBCUDACXX_HOST_DEVICE
bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp* __expected, _Tp __val, memory_order __success, memory_order __failure) {
// Both aref and atomic pass through here.
// static_assert(sizeof(_Tp) == 0);
alignas(_Tp) auto __tmp = *__expected;
bool __result = false;
NV_DISPATCH_TARGET(
@@ -1332,7 +1332,10 @@ _LIBCUDACXX_DEVICE void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val,
}
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_acq_rel_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) {
// static_assert(sizeof(_CUDA_C) != 4);
asm volatile("atom.cas.relaxed.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory");
}
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_release_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==4, int> = 0>
@@ -43,9 +43,12 @@ _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> * __a) noexce
return &__a->__a_value;
}
template <typename _Tp, int _Sco>
_LIBCUDACXX_INLINE_VISIBILITY constexpr
_LIBCUDACXX_INLINE_VISIBILITY
volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> volatile* __a) noexcept {
return &__a->__a_value;
// static_assert(sizeof(_Tp) == 0);
asm volatile("// Inside get underlying atomic (non-ref base_impl)" ::: "memory");

return &__a->__a_value; // Because it does not actually dereference, this does not generate an ld.local (atomic path)
}
template <typename _Tp, int _Sco>
_LIBCUDACXX_INLINE_VISIBILITY constexpr
@@ -109,9 +112,12 @@ _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco>* __a) noe
return __a->__a_value;
}
template <typename _Tp, int _Sco>
_LIBCUDACXX_INLINE_VISIBILITY constexpr
_LIBCUDACXX_INLINE_VISIBILITY
volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> volatile* __a) noexcept {
return __a->__a_value;
asm volatile("// Inside get underlying atomic (1)" ::: "memory");
auto deref = __a->__a_value; // <-- this deref loads from local, i.e. ld.local (atomic_ref path)
asm volatile("// Inside get underlying atomic (2)" ::: "memory");
return deref;
}
template <typename _Tp, int _Sco>
_LIBCUDACXX_INLINE_VISIBILITY constexpr
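For context, not part of the commit: the asymmetry that the debug comments above point at reduces to address-of versus load. The atomic overload only forms the address of an inline member (no memory access), while the atomic_ref overload reads a stored pointer through a volatile object, a load the compiler may not elide. A hypothetical sketch with illustrative names:

// Not the library's code; a minimal contrast of the two overloads above.
struct base_like { int  value; };          // atomic<T>: value stored inline
struct ref_like2 { int* value; };          // atomic_ref<T>: pointer to the value stored elsewhere

__device__ volatile int* underlying(volatile base_like* a) {
    return &a->value;                      // pure address computation: no load, no ld.local
}

__device__ volatile int* underlying(volatile ref_like2* a) {
    return a->value;                       // volatile read of the stored pointer: a real load is
                                           // required, which becomes ld.local when *a is on the stack
}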
25 changes: 25 additions & 0 deletions test.cu
@@ -0,0 +1,25 @@

#include<cuda/atomic>

template <typename T, typename V> union U { T t; V v; };
using atom_t = cuda::atomic<int, cuda::thread_scope_device>*;
using aref_t = cuda::atomic_ref<int, cuda::thread_scope_device>;

// Type your code here, or load an example.
__global__ void square(int* data,
int* array,
int n) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < n) {
asm volatile("// Before atom_{ref} definition" ::: "memory");
// Spill to local happens here (for atomic_ref). (st.local)
#ifdef AREF
auto ref = aref_t{*(data + tid)};
#else
auto& ref = *U<atom_t, aref_t>{ .v = aref_t{*(data + tid)} }.t;
#endif
asm volatile("// After atom_{ref} definition" ::: "memory");

ref.compare_exchange_strong(array[tid], tid, cuda::std::memory_order_acquire);
}
}
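The two PTX listings below presumably correspond to this kernel built with and without -DAREF (the exact compile invocation is not recorded in the commit; the PTX headers indicate CUDA 12.2 targeting sm_90). The atomic_ref build allocates a local depot and contains the st.local.u64/ld.local.u64 pair between the inline-asm debug markers, while the plain atomic build uses no local storage at all.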
85 changes: 85 additions & 0 deletions test_aref.ptx
@@ -0,0 +1,85 @@
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-32965470
// Cuda compilation tools, release 12.2, V12.2.91
// Based on NVVM 7.0.1
//

.version 8.2
.target sm_90
.address_size 64

// .globl _Z6squarePiS_i
.global .align 4 .b8 _ZZN4cuda3std3__48__detail21__stronger_order_cudaEiiE7__xform[16] = {3, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 3};

.visible .entry _Z6squarePiS_i(
.param .u64 _Z6squarePiS_i_param_0,
.param .u64 _Z6squarePiS_i_param_1,
.param .u32 _Z6squarePiS_i_param_2
)
{
.local .align 8 .b8 __local_depot0[8];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<2>;
.reg .b32 %r<10>;
.reg .b64 %rd<10>;


mov.u64 %SPL, __local_depot0;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [_Z6squarePiS_i_param_0];
ld.param.u64 %rd2, [_Z6squarePiS_i_param_1];
ld.param.u32 %r2, [_Z6squarePiS_i_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra $L__BB0_2;

add.u64 %rd3, %SP, 0;
add.u64 %rd5, %SPL, 0;
// begin inline asm
// Before atom_{ref} definition
// end inline asm
mul.wide.s32 %rd6, %r1, 4;
add.s64 %rd7, %rd1, %rd6;
st.local.u64 [%rd5], %rd7;
// begin inline asm
// After atom_{ref} definition
// end inline asm
cvta.to.global.u64 %rd8, %rd2;
add.s64 %rd9, %rd8, %rd6;
// begin inline asm
// split compare_exchange_strong before
// end inline asm
ld.global.u32 %r8, [%rd9];
mov.u32 %r6, 1;
// begin inline asm
// before get underlying atomic ref = %r6
// end inline asm
// begin inline asm
// deref get underlying atomic ref = %rd3
// end inline asm
// begin inline asm
// Inside get underlying atomic (1)
// end inline asm
ld.local.u64 %rd4, [%rd5];
// begin inline asm
// Inside get underlying atomic (2)
// end inline asm
// begin inline asm
// After get underlying atomic
// end inline asm
// begin inline asm
atom.cas.acquire.gpu.b32 %r7,[%rd4],%r8,%r1;
// end inline asm
st.global.u32 [%rd9], %r7;

$L__BB0_2:
ret;

}

70 changes: 70 additions & 0 deletions test_atomic.ptx
@@ -0,0 +1,70 @@
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-32965470
// Cuda compilation tools, release 12.2, V12.2.91
// Based on NVVM 7.0.1
//

.version 8.2
.target sm_90
.address_size 64

// .globl _Z6squarePiS_i
.global .align 4 .b8 _ZZN4cuda3std3__48__detail21__stronger_order_cudaEiiE7__xform[16] = {3, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 3};

.visible .entry _Z6squarePiS_i(
.param .u64 _Z6squarePiS_i_param_0,
.param .u64 _Z6squarePiS_i_param_1,
.param .u32 _Z6squarePiS_i_param_2
)
{
.reg .pred %p<2>;
.reg .b32 %r<10>;
.reg .b64 %rd<8>;


ld.param.u64 %rd1, [_Z6squarePiS_i_param_0];
ld.param.u64 %rd2, [_Z6squarePiS_i_param_1];
ld.param.u32 %r2, [_Z6squarePiS_i_param_2];
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r3, %r4, %r5;
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra $L__BB0_2;

cvta.to.global.u64 %rd5, %rd2;
// begin inline asm
// Before atom_{ref} definition
// end inline asm
mul.wide.s32 %rd6, %r1, 4;
add.s64 %rd4, %rd1, %rd6;
// begin inline asm
// After atom_{ref} definition
// end inline asm
add.s64 %rd7, %rd5, %rd6;
ld.global.u32 %r8, [%rd7];
mov.u32 %r6, 0;
// begin inline asm
// before get underlying atomic ref = %r6
// end inline asm
// begin inline asm
// deref get underlying atomic ref = %rd4
// end inline asm
// begin inline asm
// Inside get underlying atomic (non-ref base_impl)
// end inline asm
// begin inline asm
// After get underlying atomic
// end inline asm
// begin inline asm
atom.cas.acquire.gpu.b32 %r7,[%rd4],%r8,%r1;
// end inline asm
st.global.u32 [%rd7], %r7;

$L__BB0_2:
ret;

}
