Skip to content

Commit

Permalink
Add cuda::ptx:mbarrier_{try/test}_wait{_parity} (#674)
Browse files Browse the repository at this point in the history
* Add mbarrier.test_wait/try_wait exposure

* Fix link

* Move mbarrier.test_wait section down

Also some fixes to linking and formatting.

* Fix availability

* Fix test
  • Loading branch information
ahendriksen authored Nov 15, 2023
1 parent af1641f commit 2776423
Show file tree
Hide file tree
Showing 3 changed files with 886 additions and 13 deletions.
144 changes: 131 additions & 13 deletions libcudacxx/docs/extended_api/ptx.md
Original file line number Diff line number Diff line change
Expand Up @@ -673,18 +673,18 @@ __device__ static inline void red_async(

### [9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier)

| Instruction | Available in libcu++ |
|------------------------------------------|----------------------|
| [`mbarrier.init`] | No |
| [`mbarrier.inval`] | No |
| [`mbarrier.expect_tx`] | No |
| [`mbarrier.complete_tx`] | No |
| Instruction | Available in libcu++ |
|------------------------------------------|-------------------------|
| [`mbarrier.init`] | No |
| [`mbarrier.inval`] | No |
| [`mbarrier.expect_tx`] | No |
| [`mbarrier.complete_tx`] | No |
| [`mbarrier.arrive`] | CTK-FUTURE, CCCL v2.3.0 |
| [`mbarrier.arrive_drop`] | No |
| [`cp.async.mbarrier.arrive`] | No |
| [`mbarrier.test_wait/mbarrier.try_wait`] | No |
| [`mbarrier.pending_count`] | No |
| [`tensormap.cp_fenceproxy`] | No |
| [`mbarrier.arrive_drop`] | No |
| [`cp.async.mbarrier.arrive`] | No |
| [`mbarrier.test_wait/mbarrier.try_wait`] | CTK-FUTURE, CCCL v2.3.0 |
| [`mbarrier.pending_count`] | No |
| [`tensormap.cp_fenceproxy`] | No |

[`mbarrier.init`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init
[`mbarrier.inval`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval
Expand All @@ -693,14 +693,15 @@ __device__ static inline void red_async(
[`mbarrier.arrive`]: #mbarrierarrive
[`mbarrier.arrive_drop`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop
[`cp.async.mbarrier.arrive`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive
[`mbarrier.test_wait/mbarrier.try_wait`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait
[`mbarrier.test_wait/mbarrier.try_wait`]: #mbarriertest_waitmbarriertry_wait
[`mbarrier.pending_count`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-pending-count
[`tensormap.cp_fenceproxy`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy



#### `mbarrier.arrive`

- PTX ISA: [mbarrier.arrive](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
- PTX ISA: [`mbarrier.arrive`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)

```cuda
// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80
Expand Down Expand Up @@ -834,6 +835,123 @@ __global__ void kernel() {
)
}
```

#### `mbarrier.test_wait/mbarrier.try_wait`

- PTX ISA: [`mbarrier.test_wait/mbarrier.try_wait`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait)

**mbarrier_test_wait**:
```cuda
// mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80
template <typename=void>
__device__ static inline bool mbarrier_test_wait(
uint64_t* addr,
const uint64_t& state);
// mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90
// .sem = { .acquire }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline bool mbarrier_test_wait(
cuda::ptx::sem_acquire_t,
cuda::ptx::scope_t<Scope> scope,
uint64_t* addr,
const uint64_t& state);
```

**mbarrier_test_wait_parity**:
```cuda
// mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX ISA 71, SM_80
template <typename=void>
__device__ static inline bool mbarrier_test_wait_parity(
uint64_t* addr,
const uint32_t& phaseParity);
// mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90
// .sem = { .acquire }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline bool mbarrier_test_wait_parity(
cuda::ptx::sem_acquire_t,
cuda::ptx::scope_t<Scope> scope,
uint64_t* addr,
const uint32_t& phaseParity);
```

**mbarrier_try_wait**:
```cuda
// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90
template <typename=void>
__device__ static inline bool mbarrier_try_wait(
uint64_t* addr,
const uint64_t& state);
// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX ISA 78, SM_90
template <typename=void>
__device__ static inline bool mbarrier_try_wait(
uint64_t* addr,
const uint64_t& state,
const uint32_t& suspendTimeHint);
// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90
// .sem = { .acquire }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline bool mbarrier_try_wait(
cuda::ptx::sem_acquire_t,
cuda::ptx::scope_t<Scope> scope,
uint64_t* addr,
const uint64_t& state);
// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90
// .sem = { .acquire }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline bool mbarrier_try_wait(
cuda::ptx::sem_acquire_t,
cuda::ptx::scope_t<Scope> scope,
uint64_t* addr,
const uint64_t& state,
const uint32_t& suspendTimeHint);
```

**mbarrier_try_wait_parity**:
```cuda
// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90
template <typename=void>
__device__ static inline bool mbarrier_try_wait_parity(
uint64_t* addr,
const uint32_t& phaseParity);
// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. PTX ISA 78, SM_90
template <typename=void>
__device__ static inline bool mbarrier_try_wait_parity(
uint64_t* addr,
const uint32_t& phaseParity,
const uint32_t& suspendTimeHint);
// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90
// .sem = { .acquire }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline bool mbarrier_try_wait_parity(
cuda::ptx::sem_acquire_t,
cuda::ptx::scope_t<Scope> scope,
uint64_t* addr,
const uint32_t& phaseParity);
// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90
// .sem = { .acquire }
// .scope = { .cta, .cluster }
template <cuda::ptx::dot_scope Scope>
__device__ static inline bool mbarrier_try_wait_parity(
cuda::ptx::sem_acquire_t,
cuda::ptx::scope_t<Scope> scope,
uint64_t* addr,
const uint32_t& phaseParity,
const uint32_t& suspendTimeHint);
```

### [9.7.13. Warp Level Matrix Multiply-Accumulate Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions)

| Instruction | Available in libcu++ |
Expand Down
Loading

0 comments on commit 2776423

Please sign in to comment.