From dc01b597baa63cdcd1132d3260f9642cdd37f5a7 Mon Sep 17 00:00:00 2001
From: Durgadoss R <durgadossr@nvidia.com>
Date: Mon, 15 Jan 2024 19:22:30 +0530
Subject: [PATCH] [MLIR][NVVM] Add support for aligned variants of cluster
 barriers (#78142)

This patch adds:
* Support for the 'aligned' variants of the cluster barrier Ops,
  by extending the existing Op with an 'aligned' attribute.
* Docs for these Ops.
* Test cases to verify the lowering to the corresponding intrinsics.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 58 +++++++++++++++++++--
 mlir/test/Dialect/LLVMIR/nvvm.mlir          |  6 +++
 mlir/test/Target/LLVMIR/nvvmir.mlir         | 27 ++++++++++
 3 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index c5f68a2ebe3952..7140e614412f98 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -378,22 +378,74 @@ def NVVM_Barrier0Op : NVVM_Op<"barrier0"> {
 }
 
 def NVVM_ClusterArriveOp : NVVM_Op<"cluster.arrive"> {
+  let arguments = (ins OptionalAttr<UnitAttr>:$aligned);
+
+  let summary = "Cluster Barrier Arrive Op";
+  let description = [{
+    The `cluster.arrive` can be used by the threads within the cluster for synchronization and
+    communication. The `cluster.arrive` instruction marks the warps' arrival at the barrier
+    without causing the executing thread to wait for other participating threads.
+
+    The `aligned` attribute, when provided, generates the .aligned version of the PTX instruction.
+
+    [For more information, see PTX ISA]
+    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
+  }];
+
   string llvmBuilder = [{
-    createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive);
+    if ($aligned)
+      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive_aligned);
+    else
+      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive);
   }];
   let assemblyFormat = "attr-dict";
 }
 
 def NVVM_ClusterArriveRelaxedOp : NVVM_Op<"cluster.arrive.relaxed"> {
+  let arguments = (ins OptionalAttr<UnitAttr>:$aligned);
+
+  let summary = "Cluster Barrier Relaxed Arrive Op";
+  let description = [{
+    The `cluster.arrive` can be used by the threads within the cluster for synchronization and
+    communication. The `cluster.arrive` instruction marks the warps' arrival at the barrier
+    without causing the executing thread to wait for other participating threads.
+
+    The `aligned` attribute, when provided, generates the .aligned version of the PTX instruction.
+    The .relaxed qualifier on `cluster.arrive` specifies that there are no memory
+    ordering and visibility guarantees provided for the memory accesses performed prior to
+    `cluster.arrive`.
+
+    [For more information, see PTX ISA]
+    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
+  }];
+
   string llvmBuilder = [{
-    createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive_relaxed);
+    if ($aligned)
+      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive_relaxed_aligned);
+    else
+      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive_relaxed);
   }];
   let assemblyFormat = "attr-dict";
 }
 
 def NVVM_ClusterWaitOp : NVVM_Op<"cluster.wait"> {
+  let arguments = (ins OptionalAttr<UnitAttr>:$aligned);
+
+  let summary = "Cluster Barrier Wait Op";
+  let description = [{
+    The `cluster.wait` causes the executing thread to wait for all non-exited threads
+    of the cluster to perform `cluster.arrive`. The `aligned` attribute, when provided,
+    generates the .aligned version of the PTX instruction.
+
+    [For more information, see PTX ISA]
+    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
+  }];
+
   string llvmBuilder = [{
-    createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_wait);
+    if ($aligned)
+      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_wait_aligned);
+    else
+      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_wait);
   }];
   let assemblyFormat = "attr-dict";
 }
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index 39516b5090d07b..ce483ddab22a0e 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -47,6 +47,8 @@ func.func @llvm_nvvm_barrier0() {
 func.func @llvm_nvvm_cluster_arrive() {
   // CHECK: nvvm.cluster.arrive
   nvvm.cluster.arrive
+  // CHECK: nvvm.cluster.arrive {aligned}
+  nvvm.cluster.arrive {aligned}
   llvm.return
 }
 
@@ -54,6 +56,8 @@ func.func @llvm_nvvm_cluster_arrive() {
 func.func @llvm_nvvm_cluster_arrive_relaxed() {
   // CHECK: nvvm.cluster.arrive.relaxed
   nvvm.cluster.arrive.relaxed
+  // CHECK: nvvm.cluster.arrive.relaxed {aligned}
+  nvvm.cluster.arrive.relaxed {aligned}
   llvm.return
 }
 
@@ -61,6 +65,8 @@ func.func @llvm_nvvm_cluster_arrive_relaxed() {
 func.func @llvm_nvvm_cluster_wait() {
   // CHECK: nvvm.cluster.wait
   nvvm.cluster.wait
+  // CHECK: nvvm.cluster.wait {aligned}
+  nvvm.cluster.wait {aligned}
   llvm.return
 }
 
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 423b1a133a4ae2..8c5e3524a848f6 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -80,6 +80,33 @@ llvm.func @llvm_nvvm_barrier0() {
   llvm.return
 }
 
+// CHECK-LABEL: @llvm_nvvm_cluster_arrive
+llvm.func @llvm_nvvm_cluster_arrive() {
+  // CHECK: call void @llvm.nvvm.barrier.cluster.arrive()
+  nvvm.cluster.arrive
+  // CHECK: call void @llvm.nvvm.barrier.cluster.arrive.aligned()
+  nvvm.cluster.arrive {aligned}
+  llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_cluster_arrive_relaxed
+llvm.func @llvm_nvvm_cluster_arrive_relaxed() {
+  // CHECK: call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
+  nvvm.cluster.arrive.relaxed
+  // CHECK: call void @llvm.nvvm.barrier.cluster.arrive.relaxed.aligned()
+  nvvm.cluster.arrive.relaxed {aligned}
+  llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_cluster_wait
+llvm.func @llvm_nvvm_cluster_wait() {
+  // CHECK: call void @llvm.nvvm.barrier.cluster.wait()
+  nvvm.cluster.wait
+  // CHECK: call void @llvm.nvvm.barrier.cluster.wait.aligned()
+  nvvm.cluster.wait {aligned}
+  llvm.return
+}
+
 // CHECK-LABEL: @nvvm_shfl
 llvm.func @nvvm_shfl(
     %0 : i32, %1 : i32, %2 : i32,