PTX: make cp_async_bulk*_multicast functions sm_90a (#1734)
ahendriksen committed May 13, 2024
1 parent e9fe09b commit f8a26b2
Showing 6 changed files with 44 additions and 36 deletions.
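In practice, the change means the multicast overloads are now only emitted when compiling for the architecture-specific `sm_90a` target, guarded by `NV_HAS_FEATURE_SM_90a` instead of `NV_PROVIDES_SM_90`. A minimal usage sketch (not part of this commit; the function and argument names are illustrative), assuming CCCL's `<cuda/ptx>` and the `<nv/target>` dispatch macros and compilation with an `sm_90a` target such as `nvcc -arch=sm_90a`:

```cuda
// Hedged sketch: guard the call the same way the headers in the diff below do.
#include <cuda/ptx>
#include <nv/target>
#include <cstdint>

__device__ void bulk_copy_multicast(void* dst_smem,           // 16-byte aligned, shared::cluster space
                                    const void* src_gmem,     // 16-byte aligned, global space
                                    std::uint32_t size,       // multiple of 16
                                    std::uint64_t* smem_bar,  // mbarrier object in shared memory
                                    std::uint16_t cta_mask)   // destination CTAs within the cluster
{
  NV_IF_TARGET(NV_HAS_FEATURE_SM_90a,
               (cuda::ptx::cp_async_bulk(cuda::ptx::space_cluster,
                                         cuda::ptx::space_global,
                                         dst_smem, src_gmem, size, smem_bar, cta_mask);));
}
```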
6 changes: 5 additions & 1 deletion libcudacxx/docs/ptx/instructions/cp.async.bulk.md
@@ -15,6 +15,10 @@

**NOTE.** Both `srcMem` and `dstMem` must be 16-byte aligned, and `size` must be a multiple of 16.

## Changelog

- In earlier versions, `cp_async_bulk_multicast` was enabled for SM_90. This has been changed to SM_90a.

## Unicast

| C++ | PTX |
@@ -79,7 +83,7 @@ __device__ static inline void cp_async_bulk(
### [(0)](#0-cp_async_bulk_multicast) `cp_async_bulk_multicast`
{: .no_toc }
```cuda
// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90
// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
template <typename=void>
14 changes: 9 additions & 5 deletions libcudacxx/docs/ptx/instructions/cp.async.bulk.tensor.md
@@ -11,6 +11,10 @@
{:toc}
</details>

## Changelog

- In earlier versions, `cp_async_bulk_tensor_multicast` was enabled for SM_90. This has been changed to SM_90a.

## Unicast

| C++ | PTX |
@@ -194,7 +198,7 @@ __device__ static inline void cp_async_bulk_tensor(
### [(0)](#0-cp_async_bulk_tensor_multicast) `cp_async_bulk_tensor_multicast`
{: .no_toc }
```cuda
// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90
// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
template <typename=void>
@@ -211,7 +215,7 @@ __device__ static inline void cp_async_bulk_tensor(
### [(1)](#1-cp_async_bulk_tensor_multicast) `cp_async_bulk_tensor_multicast`
{: .no_toc }
```cuda
// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90
// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
template <typename=void>
@@ -228,7 +232,7 @@ __device__ static inline void cp_async_bulk_tensor(
### [(2)](#2-cp_async_bulk_tensor_multicast) `cp_async_bulk_tensor_multicast`
{: .no_toc }
```cuda
// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90
// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
template <typename=void>
@@ -245,7 +249,7 @@ __device__ static inline void cp_async_bulk_tensor(
### [(3)](#3-cp_async_bulk_tensor_multicast) `cp_async_bulk_tensor_multicast`
{: .no_toc }
```cuda
// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90
// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
template <typename=void>
@@ -262,7 +266,7 @@ __device__ static inline void cp_async_bulk_tensor(
### [(4)](#4-cp_async_bulk_tensor_multicast) `cp_async_bulk_tensor_multicast`
{: .no_toc }
```cuda
// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90
// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
template <typename=void>
8 changes: 4 additions & 4 deletions libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h
@@ -145,7 +145,7 @@ cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcM
#endif // __cccl_ptx_isa >= 800
/*
// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar],
ctaMask; // 1. PTX ISA 80, SM_90
ctaMask; // 1. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
template <typename=void>
@@ -159,7 +159,7 @@ __device__ static inline void cp_async_bulk(
const uint16_t& ctaMask);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();
template <typename = void>
_CCCL_DEVICE static inline void cp_async_bulk(
space_cluster_t,
@@ -173,7 +173,7 @@ _CCCL_DEVICE static inline void cp_async_bulk(
// __space == space_cluster (due to parameter type constraint)
// __space == space_global (due to parameter type constraint)
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
NV_HAS_FEATURE_SM_90a,
(asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], "
"%4; // 1. "
:
@@ -185,7 +185,7 @@ _CCCL_DEVICE static inline void cp_async_bulk(
: "memory");),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
__cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();));
}
#endif // __cccl_ptx_isa >= 800

40 changes: 20 additions & 20 deletions libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h
@@ -450,7 +450,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
#endif // __cccl_ptx_isa >= 800
/*
// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap,
tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90
tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
template <typename=void>
@@ -464,7 +464,7 @@ __device__ static inline void cp_async_bulk_tensor(
const uint16_t& ctaMask);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();
template <typename = void>
_CCCL_DEVICE static inline void cp_async_bulk_tensor(
space_cluster_t,
@@ -478,7 +478,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_cluster (due to parameter type constraint)
// __space == space_global (due to parameter type constraint)
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
NV_HAS_FEATURE_SM_90a,
(asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], "
"[%1, {%2}], [%3], %4; // 2a."
:
@@ -490,13 +490,13 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
: "memory");),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();));
__cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();));
}
#endif // __cccl_ptx_isa >= 800

/*
// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap,
tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90
tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
template <typename=void>
@@ -510,7 +510,7 @@ __device__ static inline void cp_async_bulk_tensor(
const uint16_t& ctaMask);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();
template <typename = void>
_CCCL_DEVICE static inline void cp_async_bulk_tensor(
space_cluster_t,
@@ -524,7 +524,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_cluster (due to parameter type constraint)
// __space == space_global (due to parameter type constraint)
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
NV_HAS_FEATURE_SM_90a,
(asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], "
"[%1, {%2, %3}], [%4], %5; // 2b."
:
@@ -537,13 +537,13 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
: "memory");),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();));
__cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();));
}
#endif // __cccl_ptx_isa >= 800

/*
// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap,
tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90
tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
template <typename=void>
@@ -557,7 +557,7 @@ __device__ static inline void cp_async_bulk_tensor(
const uint16_t& ctaMask);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();
template <typename = void>
_CCCL_DEVICE static inline void cp_async_bulk_tensor(
space_cluster_t,
@@ -571,7 +571,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_cluster (due to parameter type constraint)
// __space == space_global (due to parameter type constraint)
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
NV_HAS_FEATURE_SM_90a,
(asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], "
"[%1, {%2, %3, %4}], [%5], %6; // 2c."
:
@@ -585,13 +585,13 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
: "memory");),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();));
__cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();));
}
#endif // __cccl_ptx_isa >= 800

/*
// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap,
tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90
tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
template <typename=void>
@@ -605,7 +605,7 @@ __device__ static inline void cp_async_bulk_tensor(
const uint16_t& ctaMask);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();
template <typename = void>
_CCCL_DEVICE static inline void cp_async_bulk_tensor(
space_cluster_t,
@@ -619,7 +619,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_cluster (due to parameter type constraint)
// __space == space_global (due to parameter type constraint)
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
NV_HAS_FEATURE_SM_90a,
(asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], "
"[%1, {%2, %3, %4, %5}], [%6], %7; // 2d."
:
@@ -634,13 +634,13 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
: "memory");),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();));
__cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();));
}
#endif // __cccl_ptx_isa >= 800

/*
// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap,
tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90
tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
template <typename=void>
@@ -654,7 +654,7 @@ __device__ static inline void cp_async_bulk_tensor(
const uint16_t& ctaMask);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();
template <typename = void>
_CCCL_DEVICE static inline void cp_async_bulk_tensor(
space_cluster_t,
@@ -668,7 +668,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_cluster (due to parameter type constraint)
// __space == space_global (due to parameter type constraint)
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
NV_HAS_FEATURE_SM_90a,
(asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], "
"[%1, {%2, %3, %4, %5, %6}], [%7], %8; // 2e."
:
@@ -684,7 +684,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
: "memory");),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();));
__cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();));
}
#endif // __cccl_ptx_isa >= 800

@@ -38,7 +38,7 @@ __global__ void test_cp_async_bulk_multicast(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
NV_HAS_FEATURE_SM_90a,
(
// cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem],
// size, [smem_bar], ctaMask; // 1.
@@ -38,7 +38,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
NV_HAS_FEATURE_SM_90a,
(
// cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem],
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a.
@@ -54,7 +54,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr)

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
NV_HAS_FEATURE_SM_90a,
(
// cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem],
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b.
@@ -70,7 +70,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr)

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
NV_HAS_FEATURE_SM_90a,
(
// cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem],
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c.
@@ -86,7 +86,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr)

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
NV_HAS_FEATURE_SM_90a,
(
// cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem],
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d.
@@ -102,7 +102,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr)

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
NV_HAS_FEATURE_SM_90a,
(
// cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem],
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e.
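The other pattern visible throughout the diff is the fallback branch: on targets without the feature, the wrappers call an `extern "C"` function that is declared but never defined, so the build fails at link time with the stub's descriptive name in the error message rather than emitting unsupported PTX. A stand-alone sketch of that technique, with hypothetical names and assuming only `<nv/target>`:

```cuda
#include <nv/target>

// Declared but intentionally never defined: referencing it forces a link-time
// error whose message contains this descriptive name.
extern "C" __device__ void my_op_is_not_supported_before_SM_90a__();

__device__ int sink;

__device__ void my_op()
{
  NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,
                    (sink = 1; /* stand-in for the sm_90a-only inline PTX */),
                    (my_op_is_not_supported_before_SM_90a__();));
}
```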
