forked from NVIDIA/cccl
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Debug atomic ref generating st.local and ld.local
The problem seems to be that in the path to the actual PTX instruction, a volatile pointer is dereferenced, which causes a spill to local.
- Loading branch information
1 parent
74f1160
commit c9756c0
Showing
7 changed files
with
214 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
|
||
#include<cuda/atomic> | ||
|
||
// Two-member union; the kernel below initializes `v` and reads `t` to view the same bytes as the other type. | ||
template <typename T, typename V> union U { T t; V v; }; | ||
// Pointer to a device-scope cuda::atomic<int>. | ||
using atom_t = cuda::atomic<int, cuda::thread_scope_device>*; | ||
// Device-scope cuda::atomic_ref to an int. | ||
using aref_t = cuda::atomic_ref<int, cuda::thread_scope_device>; | ||
|
||
// Reproducer kernel: each thread with tid < n performs one device-scope | ||
// compare-exchange on data[tid], using array[tid] as the expected value and | ||
// tid as the desired value. The name `square` does not describe this behaviour. | ||
// | ||
// Build variants: | ||
//   - AREF defined:     operates through a cuda::atomic_ref over data[tid]. | ||
//   - AREF not defined: views the atomic_ref's bytes as a cuda::atomic<int>* (via U) | ||
//                       and operates through the pointed-to cuda::atomic. | ||
// | ||
// Parameters: | ||
//   data  - ints that are the target of the atomic operation (indexed by tid). | ||
//   array - expected values, indexed by tid; passed by reference to the CAS. | ||
//   n     - threads with tid >= n do nothing. | ||
__global__ void square(int* data, | ||
int* array, | ||
int n) { | ||
// Flat 1D global thread index. | ||
int tid = blockDim.x * blockIdx.x + threadIdx.x; | ||
if (tid < n) { | ||
// Emits a comment into the generated assembly to mark this point; the "memory" | ||
// clobber keeps memory operations from being moved across the marker. | ||
asm volatile("// Before atom_{ref} definition" ::: "memory"); | ||
// Spill to local happens here (for atomic_ref). (st.local) | ||
#ifdef AREF | ||
auto ref = aref_t{*(data + tid)}; | ||
#else | ||
// NOTE(review): reading `.t` after initializing `.v` is union type punning; this | ||
// presumably relies on aref_t being layout-compatible with a single pointer to | ||
// the referenced int - confirm against the libcu++ atomic_ref implementation. | ||
auto& ref = *U<atom_t, aref_t>{ .v = aref_t{*(data + tid)} }.t; | ||
#endif | ||
asm volatile("// After atom_{ref} definition" ::: "memory"); | ||
|
||
// Strong CAS with acquire ordering: expected = array[tid] (by reference), desired = tid. | ||
ref.compare_exchange_strong(array[tid], tid, cuda::std::memory_order_acquire); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
// | ||
// Generated by NVIDIA NVVM Compiler | ||
// | ||
// Compiler Build ID: CL-32965470 | ||
// Cuda compilation tools, release 12.2, V12.2.91 | ||
// Based on NVVM 7.0.1 | ||
// | ||
|
||
.version 8.2 | ||
.target sm_90 | ||
.address_size 64 | ||
|
||
// .globl _Z6squarePiS_i | ||
.global .align 4 .b8 _ZZN4cuda3std3__48__detail21__stronger_order_cudaEiiE7__xform[16] = {3, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 3}; | ||
|
||
// Entry point for square(int*, int*, int). In this listing the address of | ||
// data[tid] is stored to an 8-byte .local slot and loaded back before atom.cas. | ||
// NOTE(review): presumably the AREF (cuda::atomic_ref) build - confirm against the commit's file names. | ||
.visible .entry _Z6squarePiS_i( | ||
.param .u64 _Z6squarePiS_i_param_0, | ||
.param .u64 _Z6squarePiS_i_param_1, | ||
.param .u32 _Z6squarePiS_i_param_2 | ||
) | ||
{ | ||
// 8-byte local-memory slot; it is the target of the st.local/ld.local pair below. | ||
.local .align 8 .b8 __local_depot0[8]; | ||
.reg .b64 %SP; | ||
.reg .b64 %SPL; | ||
.reg .pred %p<2>; | ||
.reg .b32 %r<10>; | ||
.reg .b64 %rd<10>; | ||
|
||
|
||
// %SPL = local-space address of the slot, %SP = its generic address. | ||
mov.u64 %SPL, __local_depot0; | ||
cvta.local.u64 %SP, %SPL; | ||
ld.param.u64 %rd1, [_Z6squarePiS_i_param_0]; | ||
ld.param.u64 %rd2, [_Z6squarePiS_i_param_1]; | ||
ld.param.u32 %r2, [_Z6squarePiS_i_param_2]; | ||
// %r1 = ntid.x * ctaid.x + tid.x (the kernel's tid). | ||
mov.u32 %r3, %ctaid.x; | ||
mov.u32 %r4, %ntid.x; | ||
mov.u32 %r5, %tid.x; | ||
mad.lo.s32 %r1, %r4, %r3, %r5; | ||
// Branch to the exit label when tid >= n. | ||
setp.ge.s32 %p1, %r1, %r2; | ||
@%p1 bra $L__BB0_2; | ||
|
||
add.u64 %rd3, %SP, 0; | ||
add.u64 %rd5, %SPL, 0; | ||
// begin inline asm | ||
// Before atom_{ref} definition | ||
// end inline asm | ||
// %rd7 = param_0 + tid * 4, i.e. the address of data[tid]. | ||
mul.wide.s32 %rd6, %r1, 4; | ||
add.s64 %rd7, %rd1, %rd6; | ||
// Spill: the address is written to the local slot. | ||
st.local.u64 [%rd5], %rd7; | ||
// begin inline asm | ||
// After atom_{ref} definition | ||
// end inline asm | ||
// %rd9 = global address of array[tid]. | ||
cvta.to.global.u64 %rd8, %rd2; | ||
add.s64 %rd9, %rd8, %rd6; | ||
// begin inline asm | ||
// split compare_exchange_strong before | ||
// end inline asm | ||
// %r8 = array[tid], used as the compare operand of the CAS. | ||
ld.global.u32 %r8, [%rd9]; | ||
mov.u32 %r6, 1; | ||
// begin inline asm | ||
// before get underlying atomic ref = %r6 | ||
// end inline asm | ||
// begin inline asm | ||
// deref get underlying atomic ref = %rd3 | ||
// end inline asm | ||
// begin inline asm | ||
// Inside get underlying atomic (1) | ||
// end inline asm | ||
// Reload of the spilled address; %rd4 is the address operand of atom.cas below. | ||
ld.local.u64 %rd4, [%rd5]; | ||
// begin inline asm | ||
// Inside get underlying atomic (2) | ||
// end inline asm | ||
// begin inline asm | ||
// After get underlying atomic | ||
// end inline asm | ||
// Acquire, gpu-scope 32-bit CAS at [%rd4]: compare with %r8, swap in %r1 (tid), old value into %r7. | ||
// begin inline asm | ||
atom.cas.acquire.gpu.b32 %r7,[%rd4],%r8,%r1; | ||
// end inline asm | ||
// The old value is stored back to array[tid] unconditionally. | ||
st.global.u32 [%rd9], %r7; | ||
|
||
$L__BB0_2: | ||
ret; | ||
|
||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
// | ||
// Generated by NVIDIA NVVM Compiler | ||
// | ||
// Compiler Build ID: CL-32965470 | ||
// Cuda compilation tools, release 12.2, V12.2.91 | ||
// Based on NVVM 7.0.1 | ||
// | ||
|
||
.version 8.2 | ||
.target sm_90 | ||
.address_size 64 | ||
|
||
// .globl _Z6squarePiS_i | ||
.global .align 4 .b8 _ZZN4cuda3std3__48__detail21__stronger_order_cudaEiiE7__xform[16] = {3, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 3}; | ||
|
||
// Entry point for square(int*, int*, int). In this listing no .local storage is | ||
// declared: the address of data[tid] stays in %rd4 from its computation to atom.cas. | ||
// NOTE(review): presumably the non-AREF (cuda::atomic via union) build - confirm against the commit's file names. | ||
.visible .entry _Z6squarePiS_i( | ||
.param .u64 _Z6squarePiS_i_param_0, | ||
.param .u64 _Z6squarePiS_i_param_1, | ||
.param .u32 _Z6squarePiS_i_param_2 | ||
) | ||
{ | ||
.reg .pred %p<2>; | ||
.reg .b32 %r<10>; | ||
.reg .b64 %rd<8>; | ||
|
||
|
||
ld.param.u64 %rd1, [_Z6squarePiS_i_param_0]; | ||
ld.param.u64 %rd2, [_Z6squarePiS_i_param_1]; | ||
ld.param.u32 %r2, [_Z6squarePiS_i_param_2]; | ||
// %r1 = ntid.x * ctaid.x + tid.x (the kernel's tid). | ||
mov.u32 %r3, %ntid.x; | ||
mov.u32 %r4, %ctaid.x; | ||
mov.u32 %r5, %tid.x; | ||
mad.lo.s32 %r1, %r3, %r4, %r5; | ||
// Branch to the exit label when tid >= n. | ||
setp.ge.s32 %p1, %r1, %r2; | ||
@%p1 bra $L__BB0_2; | ||
|
||
cvta.to.global.u64 %rd5, %rd2; | ||
// begin inline asm | ||
// Before atom_{ref} definition | ||
// end inline asm | ||
// %rd4 = param_0 + tid * 4, i.e. the address of data[tid]; kept in a register. | ||
mul.wide.s32 %rd6, %r1, 4; | ||
add.s64 %rd4, %rd1, %rd6; | ||
// begin inline asm | ||
// After atom_{ref} definition | ||
// end inline asm | ||
// %rd7 = global address of array[tid]; %r8 = array[tid], the compare operand of the CAS. | ||
add.s64 %rd7, %rd5, %rd6; | ||
ld.global.u32 %r8, [%rd7]; | ||
mov.u32 %r6, 0; | ||
// begin inline asm | ||
// before get underlying atomic ref = %r6 | ||
// end inline asm | ||
// begin inline asm | ||
// deref get underlying atomic ref = %rd4 | ||
// end inline asm | ||
// begin inline asm | ||
// Inside get underlying atomic (non-ref base_impl) | ||
// end inline asm | ||
// begin inline asm | ||
// After get underlying atomic | ||
// end inline asm | ||
// Acquire, gpu-scope 32-bit CAS at [%rd4]: compare with %r8, swap in %r1 (tid), old value into %r7. | ||
// begin inline asm | ||
atom.cas.acquire.gpu.b32 %r7,[%rd4],%r8,%r1; | ||
// end inline asm | ||
// The old value is stored back to array[tid] unconditionally. | ||
st.global.u32 [%rd7], %r7; | ||
|
||
$L__BB0_2: | ||
ret; | ||
|
||
} | ||
|