[flang][cuda] Add conversion pattern for cuf.kernel_launch op #114129
Merged
Conversation
llvmbot added the flang (Flang issues not falling into any other category) and flang:fir-hlfir labels on Oct 29, 2024
@llvm/pr-subscribers-flang-fir-hlfir
Author: Valentin Clement (バレンタイン クレメン) (clementval)
Changes
Full diff: https://github.com/llvm/llvm-project/pull/114129.diff
2 Files Affected:
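In essence, the new CUFLaunchOpConversion pattern rewrites a cuf.kernel_launch op into a gpu.launch_func op: the i32 grid and block operands are cast to index, the dynamic shared memory size is set to a constant zero, and the kernel symbol is re-qualified under the device module. A rough before/after sketch based on the test added below (the SSA value names are illustrative, not actual compiler output):

// before: launch as emitted in the CUF dialect
cuf.kernel_launch @cuda_device_mod::@_QPsub_device2<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%0) : (!fir.ref<f32>)

// after: launch bounds cast to index, dynamic shared memory size defaults to 0
%c0_i32 = arith.constant 0 : i32
%gx = arith.index_cast %c1_i32 : i32 to index
// ... the remaining five launch bounds are cast the same way ...
gpu.launch_func @cuda_device_mod::@_QPsub_device2 blocks in (%gx, %gy, %gz) threads in (%bx, %by, %bz) dynamic_shared_memory_size %c0_i32 args(%0 : !fir.ref<f32>)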
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 14cc1cb508cfc0..2e1ff203707b22 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -15,6 +15,7 @@
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Optimizer/Transforms/CUFCommon.h"
#include "flang/Runtime/CUDA/allocatable.h"
#include "flang/Runtime/CUDA/common.h"
#include "flang/Runtime/CUDA/descriptor.h"
@@ -620,6 +621,69 @@ struct CufDataTransferOpConversion
const mlir::SymbolTable &symtab;
};
+struct CUFLaunchOpConversion
+ : public mlir::OpRewritePattern<cuf::KernelLaunchOp> {
+public:
+ using OpRewritePattern::OpRewritePattern;
+
+ CUFLaunchOpConversion(mlir::MLIRContext *context,
+ const mlir::SymbolTable &symTab)
+ : OpRewritePattern(context), symTab{symTab} {}
+
+ mlir::LogicalResult
+ matchAndRewrite(cuf::KernelLaunchOp op,
+ mlir::PatternRewriter &rewriter) const override {
+ mlir::Location loc = op.getLoc();
+ auto idxTy = mlir::IndexType::get(op.getContext());
+ auto zero = rewriter.create<mlir::arith::ConstantOp>(
+ loc, rewriter.getIntegerType(32), rewriter.getI32IntegerAttr(0));
+ auto gridSizeX =
+ rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridX());
+ auto gridSizeY =
+ rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridY());
+ auto gridSizeZ =
+ rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridZ());
+ auto blockSizeX =
+ rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getBlockX());
+ auto blockSizeY =
+ rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getBlockY());
+ auto blockSizeZ =
+ rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getBlockZ());
+ auto kernelName = mlir::SymbolRefAttr::get(
+ rewriter.getStringAttr(cudaDeviceModuleName),
+ {mlir::SymbolRefAttr::get(
+ rewriter.getContext(),
+ op.getCallee().getLeafReference().getValue())});
+ mlir::Value clusterDimX, clusterDimY, clusterDimZ;
+ if (auto funcOp = symTab.lookup<mlir::func::FuncOp>(
+ op.getCallee().getLeafReference())) {
+ if (auto clusterDimsAttr = funcOp->getAttrOfType<cuf::ClusterDimsAttr>(
+ cuf::getClusterDimsAttrName())) {
+ clusterDimX = rewriter.create<mlir::arith::ConstantIndexOp>(
+ loc, clusterDimsAttr.getX().getInt());
+ clusterDimY = rewriter.create<mlir::arith::ConstantIndexOp>(
+ loc, clusterDimsAttr.getY().getInt());
+ clusterDimZ = rewriter.create<mlir::arith::ConstantIndexOp>(
+ loc, clusterDimsAttr.getZ().getInt());
+ }
+ }
+ auto gpuLaunchOp = rewriter.create<mlir::gpu::LaunchFuncOp>(
+ loc, kernelName, mlir::gpu::KernelDim3{gridSizeX, gridSizeY, gridSizeZ},
+ mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, zero,
+ op.getArgs());
+ if (clusterDimX && clusterDimY && clusterDimZ) {
+ gpuLaunchOp.getClusterSizeXMutable().assign(clusterDimX);
+ gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);
+ gpuLaunchOp.getClusterSizeZMutable().assign(clusterDimZ);
+ }
+ rewriter.replaceOp(op, gpuLaunchOp);
+ return mlir::success();
+ }
+
+private:
+ const mlir::SymbolTable &symTab;
+};
+
class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
public:
void runOnOperation() override {
@@ -637,7 +701,8 @@ class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
/*forceUnifiedTBAATree=*/false, *dl);
- target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect>();
+ target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect,
+ mlir::gpu::GPUDialect>();
cuf::populateCUFToFIRConversionPatterns(typeConverter, *dl, symtab,
patterns);
if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
@@ -656,5 +721,5 @@ void cuf::populateCUFToFIRConversionPatterns(
patterns.insert<CufAllocOpConversion>(patterns.getContext(), &dl, &converter);
patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion,
CufFreeOpConversion>(patterns.getContext());
- patterns.insert<CufDataTransferOpConversion>(patterns.getContext(), symtab);
+ patterns.insert<CufDataTransferOpConversion, CUFLaunchOpConversion>(patterns.getContext(), symtab);
}
diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
new file mode 100644
index 00000000000000..f11bcbdb7fce55
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -0,0 +1,64 @@
+// RUN: fir-opt --split-input-file --cuf-convert %s | FileCheck %s
+
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+ gpu.module @cuda_device_mod {
+ gpu.func @_QPsub_device1() kernel {
+ cf.br ^bb1
+ ^bb1: // pred: ^bb0
+ gpu.return
+ }
+ gpu.func @_QPsub_device2(%arg0: !fir.ref<f32>) kernel {
+ cf.br ^bb1(%arg0 : !fir.ref<f32>)
+ ^bb1(%0: !fir.ref<f32>): // pred: ^bb0
+ %1 = fir.declare %0 {uniq_name = "_QFsub1Ei"} : (!fir.ref<f32>) -> !fir.ref<f32>
+ %cst = arith.constant 2.000000e+00 : f32
+ fir.store %cst to %1 : !fir.ref<f32>
+ gpu.return
+ }
+ }
+
+ func.func @_QQmain() attributes {fir.bindc_name = "main"} {
+ %0 = fir.alloca f32
+ // CHECK: %[[ALLOCA:.*]] = fir.alloca f32
+ %c1 = arith.constant 1 : index
+ %c11_i32 = arith.constant 11 : i32
+ %c6_i32 = arith.constant 6 : i32
+ %c1_i32 = arith.constant 1 : i32
+ // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}}
+ cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>()
+
+ // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} args(%[[ALLOCA]] : !fir.ref<f32>)
+ cuf.kernel_launch @cuda_device_mod::@_QPsub_device2<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%0) : (!fir.ref<f32>)
+ return
+ }
+
+}
+
+// -----
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+ gpu.module @cuda_device_mod {
+ gpu.func @_QMmod1Psub1(%arg0: !fir.ref<!fir.array<10xi32>>) kernel {
+ gpu.return
+ }
+ }
+
+ func.func @_QMmod1Psub1(%arg0: !fir.ref<!fir.array<10xi32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "adev"}) attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>, cuf.proc_attr = #cuf.cuda_proc<global>} {
+ return
+ }
+ func.func @_QMmod1Phost_sub() {
+ %c10 = arith.constant 10 : index
+ %0 = cuf.alloc !fir.array<10xi32> {bindc_name = "adev", data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEadev"} -> !fir.ref<!fir.array<10xi32>>
+ %1 = fir.shape %c10 : (index) -> !fir.shape<1>
+ %2:2 = hlfir.declare %0(%1) {data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEadev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+ %c1_i32 = arith.constant 1 : i32
+ %c10_i32 = arith.constant 10 : i32
+ cuf.kernel_launch @_QMmod1Psub1<<<%c1_i32, %c1_i32, %c1_i32, %c10_i32, %c1_i32, %c1_i32>>>(%2#1) : (!fir.ref<!fir.array<10xi32>>)
+ return
+ }
+}
+
+// CHECK-LABEL: func.func @_QMmod1Phost_sub()
+// CHECK: gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%c2{{.*}}, %c2{{.*}}, %c1{{.*}})
+
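The second module in the test exercises the cluster-dims path: the host-side func.func @_QMmod1Psub1 carries a cuf.cluster_dims<x = 2, y = 2, z = 1> attribute, which the pattern looks up through the symbol table and materializes as constant-index cluster sizes on the gpu.launch_func. The converted launch should look roughly like the following sketch (SSA names again illustrative):

%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%c2, %c2, %c1) blocks in (%gx, %gy, %gz) threads in (%tx, %ty, %tz) dynamic_shared_memory_size %c0_i32 args(%2#1 : !fir.ref<!fir.array<10xi32>>)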
Renaud-K approved these changes on Oct 29, 2024
✅ With the latest revision this PR passed the C/C++ code formatter.
clementval force-pushed the cuf_launch_pattern branch from 822acc7 to d771323 on October 29, 2024 at 20:58