
[flang][cuda] Add conversion pattern for cuf.kernel_launch op #114129

Merged: 1 commit merged into llvm:main on Oct 30, 2024

Conversation

clementval (Contributor)

No description provided.

@llvmbot added the flang (Flang issues not falling into any other category) and flang:fir-hlfir labels on Oct 29, 2024
llvmbot (Collaborator) commented on Oct 29, 2024

@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/114129.diff

2 Files Affected:

  • (modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+67-2)
  • (added) flang/test/Fir/CUDA/cuda-launch.fir (+64)
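
In brief, the new CUFLaunchOpConversion pattern rewrites cuf.kernel_launch into gpu.launch_func: the i32 grid and block operands are index-cast, a zero dynamic shared-memory size is passed, kernel arguments are forwarded, and, when the callee carries a cuf.cluster_dims attribute, cluster sizes are attached as well. A minimal before/after sketch, simplified from the test added in this PR (the %grid, %block, and %c0_i32 names are illustrative, not verbatim pass output):

  // Before --cuf-convert: host-side launch of a kernel living in @cuda_device_mod.
  cuf.kernel_launch @cuda_device_mod::@_QPsub_device2<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%0) : (!fir.ref<f32>)

  // After: launch bounds become index values, a zero dynamic shared-memory size
  // is passed explicitly, and the kernel argument is forwarded.
  %c0_i32 = arith.constant 0 : i32
  %grid = arith.index_cast %c1_i32 : i32 to index
  %block = arith.index_cast %c1_i32 : i32 to index
  gpu.launch_func @cuda_device_mod::@_QPsub_device2
      blocks in (%grid, %grid, %grid) threads in (%block, %block, %block)
      dynamic_shared_memory_size %c0_i32
      args(%0 : !fir.ref<f32>)
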
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 14cc1cb508cfc0..2e1ff203707b22 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -15,6 +15,7 @@
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Optimizer/Transforms/CUFCommon.h"
 #include "flang/Runtime/CUDA/allocatable.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
@@ -620,6 +621,69 @@ struct CufDataTransferOpConversion
   const mlir::SymbolTable &symtab;
 };
 
+struct CUFLaunchOpConversion
+    : public mlir::OpRewritePattern<cuf::KernelLaunchOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  CUFLaunchOpConversion(mlir::MLIRContext *context,
+                        const mlir::SymbolTable &symTab)
+      : OpRewritePattern(context), symTab{symTab} {}
+
+  mlir::LogicalResult
+  matchAndRewrite(cuf::KernelLaunchOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    mlir::Location loc = op.getLoc();
+    auto idxTy = mlir::IndexType::get(op.getContext());
+    auto zero = rewriter.create<mlir::arith::ConstantOp>(
+        loc, rewriter.getIntegerType(32), rewriter.getI32IntegerAttr(0));
+    auto gridSizeX =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridX());
+    auto gridSizeY =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridY());
+    auto gridSizeZ =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridZ());
+    auto blockSizeX =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getBlockX());
+    auto blockSizeY =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getBlockY());
+    auto blockSizeZ =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getBlockZ());
+    auto kernelName = mlir::SymbolRefAttr::get(
+        rewriter.getStringAttr(cudaDeviceModuleName),
+        {mlir::SymbolRefAttr::get(
+            rewriter.getContext(),
+            op.getCallee().getLeafReference().getValue())});
+    mlir::Value clusterDimX, clusterDimY, clusterDimZ;
+    if (auto funcOp = symTab.lookup<mlir::func::FuncOp>(
+            op.getCallee().getLeafReference())) {
+      if (auto clusterDimsAttr = funcOp->getAttrOfType<cuf::ClusterDimsAttr>(
+              cuf::getClusterDimsAttrName())) {
+        clusterDimX = rewriter.create<mlir::arith::ConstantIndexOp>(
+            loc, clusterDimsAttr.getX().getInt());
+        clusterDimY = rewriter.create<mlir::arith::ConstantIndexOp>(
+            loc, clusterDimsAttr.getY().getInt());
+        clusterDimZ = rewriter.create<mlir::arith::ConstantIndexOp>(
+            loc, clusterDimsAttr.getZ().getInt());
+      }
+    }
+    auto gpuLaunchOp = rewriter.create<mlir::gpu::LaunchFuncOp>(
+        loc, kernelName, mlir::gpu::KernelDim3{gridSizeX, gridSizeY, gridSizeZ},
+        mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, zero,
+        op.getArgs());
+    if (clusterDimX && clusterDimY && clusterDimZ) {
+      gpuLaunchOp.getClusterSizeXMutable().assign(clusterDimX);
+      gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);
+      gpuLaunchOp.getClusterSizeZMutable().assign(clusterDimZ);
+    }
+    rewriter.replaceOp(op, gpuLaunchOp);
+    return mlir::success();
+  }
+
+private:
+  const mlir::SymbolTable &symTab;
+};
+
 class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
 public:
   void runOnOperation() override {
@@ -637,7 +701,8 @@ class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
         fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
     fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
                                          /*forceUnifiedTBAATree=*/false, *dl);
-    target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect>();
+    target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect,
+                           mlir::gpu::GPUDialect>();
     cuf::populateCUFToFIRConversionPatterns(typeConverter, *dl, symtab,
                                             patterns);
     if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
@@ -656,5 +721,5 @@ void cuf::populateCUFToFIRConversionPatterns(
   patterns.insert<CufAllocOpConversion>(patterns.getContext(), &dl, &converter);
   patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion,
                   CufFreeOpConversion>(patterns.getContext());
-  patterns.insert<CufDataTransferOpConversion>(patterns.getContext(), symtab);
+  patterns.insert<CufDataTransferOpConversion, CUFLaunchOpConversion>(patterns.getContext(), symtab);
 }
diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
new file mode 100644
index 00000000000000..f11bcbdb7fce55
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -0,0 +1,64 @@
+// RUN: fir-opt --split-input-file --cuf-convert %s | FileCheck %s
+
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+  gpu.module @cuda_device_mod {
+    gpu.func @_QPsub_device1() kernel {
+      cf.br ^bb1
+    ^bb1:  // pred: ^bb0
+      gpu.return
+    }
+    gpu.func @_QPsub_device2(%arg0: !fir.ref<f32>) kernel {
+      cf.br ^bb1(%arg0 : !fir.ref<f32>)
+    ^bb1(%0: !fir.ref<f32>):  // pred: ^bb0
+      %1 = fir.declare %0 {uniq_name = "_QFsub1Ei"} : (!fir.ref<f32>) -> !fir.ref<f32>
+      %cst = arith.constant 2.000000e+00 : f32
+      fir.store %cst to %1 : !fir.ref<f32>
+      gpu.return
+    }
+  }
+
+  func.func @_QQmain() attributes {fir.bindc_name = "main"} {
+    %0 = fir.alloca f32
+    // CHECK: %[[ALLOCA:.*]] = fir.alloca f32
+    %c1 = arith.constant 1 : index
+    %c11_i32 = arith.constant 11 : i32
+    %c6_i32 = arith.constant 6 : i32
+    %c1_i32 = arith.constant 1 : i32
+    // CHECK: gpu.launch_func  @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})  dynamic_shared_memory_size %c0{{.*}}
+    cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>()
+
+    // CHECK: gpu.launch_func  @cuda_device_mod::@_QPsub_device2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})  dynamic_shared_memory_size %c0{{.*}} args(%[[ALLOCA]] : !fir.ref<f32>)
+    cuf.kernel_launch @cuda_device_mod::@_QPsub_device2<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%0) : (!fir.ref<f32>)
+    return
+  }
+
+}
+
+// -----
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+  gpu.module @cuda_device_mod {
+    gpu.func @_QMmod1Psub1(%arg0: !fir.ref<!fir.array<10xi32>>) kernel {
+      gpu.return
+    }
+  }
+
+  func.func @_QMmod1Psub1(%arg0: !fir.ref<!fir.array<10xi32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "adev"}) attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>, cuf.proc_attr = #cuf.cuda_proc<global>} {
+    return
+  }
+  func.func @_QMmod1Phost_sub() {
+    %c10 = arith.constant 10 : index
+    %0 = cuf.alloc !fir.array<10xi32> {bindc_name = "adev", data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEadev"} -> !fir.ref<!fir.array<10xi32>>
+    %1 = fir.shape %c10 : (index) -> !fir.shape<1>
+    %2:2 = hlfir.declare %0(%1) {data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEadev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+    %c1_i32 = arith.constant 1 : i32
+    %c10_i32 = arith.constant 10 : i32
+    cuf.kernel_launch @_QMmod1Psub1<<<%c1_i32, %c1_i32, %c1_i32, %c10_i32, %c1_i32, %c1_i32>>>(%2#1) : (!fir.ref<!fir.array<10xi32>>)
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @_QMmod1Phost_sub()
+// CHECK: gpu.launch_func  @cuda_device_mod::@_QMmod1Psub1 clusters in (%c2{{.*}}, %c2{{.*}}, %c1{{.*}})
+

github-actions bot commented on Oct 29, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

clementval merged commit 0fa2fb3 into llvm:main on Oct 30, 2024
8 checks passed
clementval deleted the cuf_launch_pattern branch on October 30, 2024
Labels: flang (Flang issues not falling into any other category), flang:fir-hlfir
3 participants