diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index efef61b5c6e712..8d4a110ee801f0 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -678,6 +678,9 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
                Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
+               Optional<Index>:$clusterSizeX,
+               Optional<Index>:$clusterSizeY,
+               Optional<Index>:$clusterSizeZ,
                Optional<I32>:$dynamicSharedMemorySize)>,
     Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
   let summary = "GPU kernel launch operation";
@@ -700,8 +703,11 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     to the amount of dynamic shared memory a kernel's workgroup should be
     allocated; when this operand is not present, a zero size is assumed.
 
-    The body region has at least _twelve_ arguments, grouped as follows:
+    The body region has at least _twelve_ arguments, or _eighteen_ if cluster
+    dimensions are present, grouped as follows:
 
+    -   three optional arguments that contain cluster identifiers along x,y,z
+        dimensions;
     -   three arguments that contain block identifiers along x,y,z dimensions;
     -   three arguments that contain thread identifiers along x,y,z dimensions;
     -   operands of the `gpu.launch` operation as is (i.e. the operands for
@@ -713,6 +719,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
 
    ```
    operation ::= `gpu.launch` (`async` (`[` ssa-id-list `]`)? )?
+                              (`clusters` `(` ssa-id-list `)` `in` ssa-reassignment)?
                               `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
                               `threads` `(` ssa-id-list `)` `in` ssa-reassignment
                               (dynamic_shared_memory_size ssa-use)?
@@ -763,6 +770,16 @@ def GPU_LaunchOp : GPU_Op<"launch", [
       // Assuming %val1 is defined outside the gpu.launch region.
       %42 = load %workgroup[%bx] : memref<32xf32, 3>
     }
+
+    // Launch with clusters.
+    gpu.launch clusters(%cx, %cy, %cz) in (%sz_cx = %0, %sz_cy = %1, %sz_cz = %2)
+               blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5)
+               threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8)
+    {
+      // Cluster, block and thread identifiers, as well as cluster/block/grid
+      // sizes, are immediately usable inside the body region.
+      "some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
+    }
    ```
 
     Rationale: using operation/block arguments gives analyses a clear way of
@@ -784,7 +801,10 @@ def GPU_LaunchOp : GPU_Op<"launch", [
       CArg<"Type", "nullptr">:$asyncTokenType,
       CArg<"ValueRange", "{}">:$asyncDependencies,
       CArg<"TypeRange", "{}">:$workgroupAttributions,
-      CArg<"TypeRange", "{}">:$privateAttributions)>
+      CArg<"TypeRange", "{}">:$privateAttributions,
+      CArg<"Value", "nullptr">:$clusterSizeX,
+      CArg<"Value", "nullptr">:$clusterSizeY,
+      CArg<"Value", "nullptr">:$clusterSizeZ)>
   ];
 
   let extraClassDeclaration = [{
@@ -792,17 +812,24 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     KernelDim3 getBlockIds();
     /// Get the SSA values corresponding to kernel thread identifiers.
     KernelDim3 getThreadIds();
+    /// Get the SSA values corresponding to kernel cluster identifiers.
+    std::optional<KernelDim3> getClusterIds();
     /// Get the SSA values corresponding to kernel grid size.
     KernelDim3 getGridSize();
     /// Get the SSA values corresponding to kernel block size.
     KernelDim3 getBlockSize();
+    /// Get the SSA values corresponding to kernel cluster size.
+    std::optional<KernelDim3> getClusterSize();
     /// Get the SSA values passed as operands to specify the grid size.
     KernelDim3 getGridSizeOperandValues();
     /// Get the SSA values passed as operands to specify the block size.
     KernelDim3 getBlockSizeOperandValues();
+    /// Get the SSA values passed as operands to specify the cluster size.
+    std::optional<KernelDim3> getClusterSizeOperandValues();
 
     static StringRef getBlocksKeyword() { return "blocks"; }
+    static StringRef getClustersKeyword() { return "clusters"; }
     static StringRef getThreadsKeyword() { return "threads"; }
     static StringRef getDynamicSharedMemorySizeKeyword() {
       return "dynamic_shared_memory_size";
@@ -816,6 +843,21 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     /// placed in the leading positions of the argument list.
     static constexpr unsigned kNumConfigRegionAttributes = 12;
 
+    /// Returns true if cluster sizes are specified.
+    bool hasClusterSize() {
+      return getClusterSizeX() && getClusterSizeY() && getClusterSizeZ();
+    }
+    /// Returns the number of operands, including the cluster sizes if present.
+    unsigned getNumConfigOperands() {
+      return kNumConfigOperands + (hasClusterSize() ? 3 : 0);
+    }
+    /// Returns the number of region attributes, including the cluster
+    /// identifiers and sizes if present.
+    unsigned getNumConfigRegionAttributes() {
+      return kNumConfigRegionAttributes + (hasClusterSize() ? 6 : 0);
+    }
+
     /// Returns the keywords used in the custom syntax for this Op.
     static StringRef getWorkgroupKeyword() { return "workgroup"; }
     static StringRef getPrivateKeyword() { return "private"; }
@@ -831,7 +873,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     /// the workgroup memory
     ArrayRef<BlockArgument> getWorkgroupAttributions() {
       auto begin =
-          std::next(getBody().args_begin(), kNumConfigRegionAttributes);
+          std::next(getBody().args_begin(), getNumConfigRegionAttributes());
       auto end = std::next(begin, getNumWorkgroupAttributions());
       return {begin, end};
     }
@@ -842,7 +884,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
 
     /// Returns the number of buffers located in the private memory.
     unsigned getNumPrivateAttributions() {
-      return getBody().getNumArguments() - kNumConfigRegionAttributes -
+      return getBody().getNumArguments() - getNumConfigRegionAttributes() -
              getNumWorkgroupAttributions();
     }
@@ -853,7 +895,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
       // memory.
       auto begin =
           std::next(getBody().args_begin(),
-                    kNumConfigRegionAttributes + getNumWorkgroupAttributions());
+                    getNumConfigRegionAttributes() + getNumWorkgroupAttributions());
       return {begin, getBody().args_end()};
     }
@@ -871,6 +913,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
   let hasCanonicalizer = 1;
   let hasCustomAssemblyFormat = 1;
   let hasRegionVerifier = 1;
+  let hasVerifier = 1;
 }
 
 def GPU_PrintfOp : GPU_Op<"printf", [MemoryEffects<[MemWrite]>]>,
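Note for readers: with clusters present, the six new region arguments are appended after the twelve pre-existing ones, matching `getClusterIds` (arguments 12-14) and `getClusterSize` (arguments 15-17) in the implementation below. A minimal sketch of the resulting layout, with illustrative SSA names and `%c1`/`%c4`/`%c8` assumed to be `index` constants:

```mlir
// Positional layout of the 18 leading region arguments of a clustered
// gpu.launch (names are illustrative):
//   args[0..2]   block ids      %bx, %by, %bz
//   args[3..5]   thread ids     %tx, %ty, %tz
//   args[6..8]   grid sizes     %sz_bx, %sz_by, %sz_bz
//   args[9..11]  block sizes    %sz_tx, %sz_ty, %sz_tz
//   args[12..14] cluster ids    %cx, %cy, %cz
//   args[15..17] cluster sizes  %sz_cx, %sz_cy, %sz_cz
gpu.launch clusters(%cx, %cy, %cz) in (%sz_cx = %c1, %sz_cy = %c1, %sz_cz = %c1)
           blocks(%bx, %by, %bz) in (%sz_bx = %c4, %sz_by = %c4, %sz_bz = %c4)
           threads(%tx, %ty, %tz) in (%sz_tx = %c8, %sz_ty = %c8, %sz_tz = %c8) {
  gpu.terminator
}
```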
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index dd482f305fcbc8..020900934c9f72 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -646,7 +646,8 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
                      Value getBlockSizeZ, Value dynamicSharedMemorySize,
                      Type asyncTokenType, ValueRange asyncDependencies,
                      TypeRange workgroupAttributions,
-                     TypeRange privateAttributions) {
+                     TypeRange privateAttributions, Value clusterSizeX,
+                     Value clusterSizeY, Value clusterSizeZ) {
   // Add a WorkGroup attribution attribute. This attribute is required to
   // identify private attributions in the list of block argguments.
   result.addAttribute(getNumWorkgroupAttributionsAttrName(),
@@ -660,6 +661,12 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
 
   // Add grid and block sizes as op operands, followed by the data operands.
   result.addOperands({gridSizeX, gridSizeY, gridSizeZ, getBlockSizeX,
                       getBlockSizeY, getBlockSizeZ});
+  if (clusterSizeX)
+    result.addOperands(clusterSizeX);
+  if (clusterSizeY)
+    result.addOperands(clusterSizeY);
+  if (clusterSizeZ)
+    result.addOperands(clusterSizeZ);
   if (dynamicSharedMemorySize)
     result.addOperands(dynamicSharedMemorySize);
 
@@ -678,9 +685,12 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
     body->addArgument(argTy, result.location);
   kernelRegion->push_back(body);
   // Fill OperandSegmentSize Attribute.
-  SmallVector<int32_t, 8> segmentSizes(8, 1);
+  SmallVector<int32_t, 11> segmentSizes(11, 1);
   segmentSizes.front() = asyncDependencies.size();
   segmentSizes.back() = dynamicSharedMemorySize ? 1 : 0;
+  segmentSizes[7] = clusterSizeX ? 1 : 0;
+  segmentSizes[8] = clusterSizeY ? 1 : 0;
+  segmentSizes[9] = clusterSizeZ ? 1 : 0;
   result.addAttribute(getOperandSegmentSizeAttr(),
                       builder.getDenseI32ArrayAttr(segmentSizes));
 }
@@ -709,6 +719,22 @@ KernelDim3 LaunchOp::getBlockSize() {
   return KernelDim3{args[9], args[10], args[11]};
 }
 
+std::optional<KernelDim3> LaunchOp::getClusterIds() {
+  assert(!getBody().empty() && "LaunchOp body must not be empty.");
+  if (!hasClusterSize())
+    return std::nullopt;
+  auto args = getBody().getArguments();
+  return KernelDim3{args[12], args[13], args[14]};
+}
+
+std::optional<KernelDim3> LaunchOp::getClusterSize() {
+  assert(!getBody().empty() && "LaunchOp body must not be empty.");
+  if (!hasClusterSize())
+    return std::nullopt;
+  auto args = getBody().getArguments();
+  return KernelDim3{args[15], args[16], args[17]};
+}
+
 KernelDim3 LaunchOp::getGridSizeOperandValues() {
   auto operands = getOperands().drop_front(getAsyncDependencies().size());
   return KernelDim3{operands[0], operands[1], operands[2]};
@@ -719,6 +745,20 @@ KernelDim3 LaunchOp::getBlockSizeOperandValues() {
   return KernelDim3{operands[3], operands[4], operands[5]};
 }
 
+std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
+  auto operands = getOperands().drop_front(getAsyncDependencies().size());
+  if (!hasClusterSize())
+    return std::nullopt;
+  return KernelDim3{operands[6], operands[7], operands[8]};
+}
+
+LogicalResult LaunchOp::verify() {
+  if (!hasClusterSize() &&
+      (getClusterSizeX() || getClusterSizeY() || getClusterSizeZ()))
+    return emitOpError() << "cluster size must be specified in all three dimensions";
+  return success();
+}
+
 LogicalResult LaunchOp::verifyRegions() {
   // Kernel launch takes kNumConfigOperands leading operands for grid/block
   // sizes and transforms them into kNumConfigRegionAttributes region arguments
@@ -778,6 +818,12 @@ void LaunchOp::print(OpAsmPrinter &p) {
     p << " [" << getAsyncDependencies() << ']';
   }
   // Print the launch configuration.
+  if (hasClusterSize()) {
+    p << ' ' << getClustersKeyword();
+    printSizeAssignment(p, getClusterSize().value(),
+                        getClusterSizeOperandValues().value(),
+                        getClusterIds().value());
+  }
   p << ' ' << getBlocksKeyword();
   printSizeAssignment(p, getGridSize(), getGridSizeOperandValues(),
                       getBlockIds());
@@ -831,6 +877,7 @@ parseSizeAssignment(OpAsmParser &parser,
 
 /// Parses a Launch operation.
 /// operation ::= `gpu.launch` (`async` `[` ssa-id-list `]`)?
+///                            (`clusters` `(` ssa-id-list `)` `in` ssa-reassignment)?
 ///                            `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
 ///                            `threads` `(` ssa-id-list `)` `in` ssa-reassignment
 ///                            memory-attribution
@@ -840,7 +887,6 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
   // Sizes of the grid and block.
   SmallVector<OpAsmParser::UnresolvedOperand> sizes(
       LaunchOp::kNumConfigOperands);
-  MutableArrayRef<OpAsmParser::UnresolvedOperand> sizesRef(sizes);
 
   // Actual (data) operands passed to the kernel.
   SmallVector<OpAsmParser::UnresolvedOperand> dataOperands;
@@ -848,7 +894,6 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
   // Region arguments to be created.
   SmallVector<OpAsmParser::UnresolvedOperand> regionArgs(
       LaunchOp::kNumConfigRegionAttributes);
-  MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);
 
   // Parse optional async dependencies.
   SmallVector<OpAsmParser::UnresolvedOperand, 4> asyncDependencies;
@@ -861,6 +906,24 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
   if (parser.getNumResults() > 0)
     result.types.push_back(asyncTokenType);
 
+  bool hasCluster = false;
+  if (succeeded(
+          parser.parseOptionalKeyword(LaunchOp::getClustersKeyword().data()))) {
+    hasCluster = true;
+    sizes.resize(9);
+    regionArgs.resize(18);
+  }
+  MutableArrayRef<OpAsmParser::UnresolvedOperand> sizesRef(sizes);
+  MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);
+
+  // The last three size operands assign the cluster size; in the region
+  // argument list, the cluster ids and sizes are the last six arguments.
+  if (hasCluster) {
+    if (parseSizeAssignment(parser, sizesRef.drop_front(6),
+                            regionArgsRef.slice(15, 3),
+                            regionArgsRef.slice(12, 3)))
+      return failure();
+  }
   // Parse the size assignment segments: the first segment assigns grid sizes
   // and defines values for block identifiers; the second segment assigns block
   // sizes and defines values for thread identifiers. In the region argument
@@ -898,7 +961,7 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
   // LaunchOp::getNumWorkgroupAttributionsAttrName().
   Type index = parser.getBuilder().getIndexType();
   SmallVector<Type> dataTypes(
-      LaunchOp::kNumConfigRegionAttributes, index);
+      LaunchOp::kNumConfigRegionAttributes + 6, index);
 
   SmallVector<OpAsmParser::Argument> regionArguments;
   for (auto ssaValueAndType : llvm::zip(regionArgs, dataTypes)) {
@@ -916,8 +979,9 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
 
   // Store the number of operands we just parsed as the number of workgroup
   // memory attributions.
-  unsigned numWorkgroupAttrs =
-      regionArguments.size() - LaunchOp::kNumConfigRegionAttributes;
+  unsigned numWorkgroupAttrs = regionArguments.size() -
+                               LaunchOp::kNumConfigRegionAttributes -
+                               (hasCluster ? 6 : 0);
   result.addAttribute(LaunchOp::getNumWorkgroupAttributionsAttrName(),
                       builder.getI64IntegerAttr(numWorkgroupAttrs));
 
@@ -934,8 +998,14 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
       parser.parseOptionalAttrDict(result.attributes))
     return failure();
 
-  SmallVector<int32_t, 8> segmentSizes(8, 1);
+  SmallVector<int32_t, 11> segmentSizes(11, 1);
   segmentSizes.front() = asyncDependencies.size();
+
+  if (!hasCluster) {
+    segmentSizes[7] = 0;
+    segmentSizes[8] = 0;
+    segmentSizes[9] = 0;
+  }
   segmentSizes.back() = hasDynamicSharedMemorySize ? 1 : 0;
   result.addAttribute(LaunchOp::getOperandSegmentSizeAttr(),
                       parser.getBuilder().getDenseI32ArrayAttr(segmentSizes));
@@ -992,7 +1062,7 @@ BlockArgument LaunchOp::addWorkgroupAttribution(Type type, Location loc) {
   (*this)->setAttr(attrName,
                    IntegerAttr::get(attr.getType(), attr.getValue() + 1));
   return getBody().insertArgument(
-      LaunchOp::kNumConfigRegionAttributes + attr.getInt(), type, loc);
+      LaunchOp::getNumConfigRegionAttributes() + attr.getInt(), type, loc);
 }
 
 /// Adds a new block argument that corresponds to buffers located in
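Illustrative sketch (not a test added by this patch) of the state the new `LaunchOp::verify()` rejects. The custom parser can never produce a partial cluster size, but a builder call or the generic form can, e.g. by supplying only `clusterSizeX`; `%sz` is assumed to be an `index` value:

```mlir
// Seven operands: six grid/block sizes plus a lone clusterSizeX, encoded by
// cluster segment sizes <1, 0, 0>, so hasClusterSize() is false while
// getClusterSizeX() is non-null.
// expected-error @below {{cluster size must be specified in all three dimensions}}
"gpu.launch"(%sz, %sz, %sz, %sz, %sz, %sz, %sz) ({
^bb0(%bx: index, %by: index, %bz: index, %tx: index, %ty: index, %tz: index,
     %gx: index, %gy: index, %gz: index, %sx: index, %sy: index, %szz: index):
  gpu.terminator
}) {operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0>}
    : (index, index, index, index, index, index, index) -> ()
```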
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index 7432a58f18b442..2436113dc4239c 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -49,15 +49,21 @@ static void createForAllDimensions(OpBuilder &builder, Location loc,
 /// entry block of `launchOpBody`, to the corresponding result value of the
 /// added operations.
 static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
-                                     Region &launchOpBody, IRMapping &map) {
+                                     Region &launchOpBody, IRMapping &map,
+                                     bool hasCluster = false) {
   OpBuilder builder(loc->getContext());
   Block &firstBlock = launchOpBody.front();
   builder.setInsertionPointToStart(&launchFuncOpBody.front());
-  SmallVector<Value, 12> indexOps;
+  SmallVector<Value> indexOps;
+  // The order is important here: it must match the order of the region arguments.
   createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
   createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
   createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
   createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
+  if (hasCluster) {
+    createForAllDimensions<gpu::ClusterIdOp>(builder, loc, indexOps);
+    createForAllDimensions<gpu::ClusterDimOp>(builder, loc, indexOps);
+  }
   // Replace the leading 12 function args with the respective thread/block index
   // operations. Iterate backwards since args are erased and indices change.
   for (const auto &indexOp : enumerate(indexOps))
@@ -212,9 +218,11 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
 
   IRMapping map;
   // Map the arguments corresponding to the launch parameters like blockIdx,
-  // threadIdx, etc.
+  // threadIdx, etc. If a cluster size is present, we also generate clusterIdx
+  // and clusterDim.
   Region &outlinedFuncBody = outlinedFunc.getBody();
-  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);
+  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map,
+                           launchOp.hasClusterSize());
 
   // Map memory attributions from the LaunOp op to the GPUFuncOp attributions.
   for (const auto &[launchArg, funcArg] :
@@ -278,12 +286,14 @@ static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
   // The launch op has an optional dynamic shared memory size. If it doesn't
   // exist, we use zero.
   Value asyncToken = launchOp.getAsyncToken();
+  std::optional<gpu::KernelDim3> clusterSize =
+      launchOp.getClusterSizeOperandValues();
   auto launchFunc = builder.create<gpu::LaunchFuncOp>(
       launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
       launchOp.getBlockSizeOperandValues(),
       launchOp.getDynamicSharedMemorySize(), operands,
       asyncToken ? asyncToken.getType() : nullptr,
-      launchOp.getAsyncDependencies());
+      launchOp.getAsyncDependencies(), clusterSize);
   launchOp.replaceAllUsesWith(launchFunc);
   launchOp.erase();
 }
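A condensed sketch of the pass's effect on a clustered launch (derived from the `launch_cluster` test below; the `@kernel_module::@kernel` names here are hypothetical). The cluster operands survive outlining and are forwarded to `gpu.launch_func`:

```mlir
// Before gpu-kernel-outlining: the body uses the ids directly.
gpu.launch clusters(%cx, %cy, %cz) in (%ccx = %c1, %ccy = %c2, %ccz = %c1)
           blocks(%bx, %by, %bz) in (%cbx = %c8, %cby = %c8, %cbz = %c8)
           threads(%tx, %ty, %tz) in (%ctx = %c32, %cty = %c32, %ctz = %c32) {
  "some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
  gpu.terminator
}

// After: the body moves into @kernel_module::@kernel (where cluster ids are
// rematerialized as gpu.cluster_id/gpu.cluster_dim), and the launch becomes:
gpu.launch_func @kernel_module::@kernel
    clusters in (%c1, %c2, %c1)
    blocks in (%c8, %c8, %c8)
    threads in (%c32, %c32, %c32)
```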
diff --git a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
index a058365a104a1f..79eef8ae7eb856 100644
--- a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
+++ b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
@@ -17,8 +17,8 @@ func.func @one_d_loop(%A : memref<?xf32>, %B : memref<?xf32>) {
   // CHECK-BLOCKS-NEXT: %{{.*}} = arith.constant 1 : index
   // CHECK-BLOCKS-NEXT: %[[ONE:.*]] = arith.constant 1 : index
 
-  // CHECK-THREADS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}}0 = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
-  // CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}}0 = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
+  // CHECK-THREADS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
+  // CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
   affine.for %i = 0 to 42 {
   // CHECK-THREADS-NEXT: %[[INDEX:.*]] = arith.addi %{{.*}}, %[[T0]]
   // CHECK-THREADS-NEXT: memref.load %{{.*}}[%[[INDEX]]]
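This test fix is independent of clusters: in the old pattern, `%{{.*}}0` requires the SSA name to end in a literal `0`, which over-constrains the match. The sketch below restates the two FileCheck patterns:

```mlir
// Old: regex ".*" followed by a literal '0'
//   %{{.*}}0 = %[[ONE]]    // matches "%arg10 = ..." but not "%arg1 = ..."
// New: any SSA name.
//   %{{.*}} = %[[ONE]]
```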
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index 8a34d64326072b..4d3a898fdd1565 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -16,7 +16,7 @@ func.func @no_region_attrs(%sz : index) {
   ^bb1(%bx: index, %by: index, %bz: index,
        %tx: index, %ty: index, %tz: index):
     gpu.terminator
-  }) {operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0>} : (index, index, index, index, index, index) -> ()
+  }) {operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0>} : (index, index, index, index, index, index) -> ()
   return
 }
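The widened `operandSegmentSizes` array mirrors the new `Arguments` list; a sketch of the 11-segment layout (values shown for the test above, which has no async dependencies, no cluster sizes, and no dynamic shared memory size):

```mlir
// gpu.launch operand segments after this patch:
//   [0]     asyncDependencies        (variadic count)
//   [1..3]  gridSizeX, Y, Z          (always 1 each)
//   [4..6]  blockSizeX, Y, Z         (always 1 each)
//   [7..9]  clusterSizeX, Y, Z       (all 1 or all 0)
//   [10]    dynamicSharedMemorySize  (1 or 0)
// Hence, for the launch in this test:
//   operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0>
```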
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 8020f6dfa65b74..601add9a9f91c0 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -407,3 +407,77 @@ func.func @launch_memory_attributions_1(%arg0 : memref<*xf32>) {
 }
 
 // CHECK-DL-LABEL: gpu.module @launch_memory_attributions_1_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
+// -----
+
+// CHECK: module attributes {gpu.container_module}
+
+// CHECK-LABEL: func @launch_cluster()
+func.func @launch_cluster() {
+  // CHECK: %[[ARG0:.*]] = "op"() : () -> f32
+  %0 = "op"() : () -> (f32)
+  // CHECK: %[[ARG1:.*]] = "op"() : () -> memref<?xf32, 1>
+  %1 = "op"() : () -> (memref<?xf32, 1>)
+  // CHECK: %[[CDIMX:.*]] = arith.constant 1
+  %cDimX = arith.constant 1 : index
+  // CHECK: %[[CDIMY:.*]] = arith.constant 2
+  %cDimY = arith.constant 2 : index
+  // CHECK: %[[CDIMZ:.*]] = arith.constant 1
+  %cDimZ = arith.constant 1 : index
+  // CHECK: %[[GDIMX:.*]] = arith.constant 8
+  %gDimX = arith.constant 8 : index
+  // CHECK: %[[GDIMY:.*]] = arith.constant 12
+  %gDimY = arith.constant 12 : index
+  // CHECK: %[[GDIMZ:.*]] = arith.constant 16
+  %gDimZ = arith.constant 16 : index
+  // CHECK: %[[BDIMX:.*]] = arith.constant 20
+  %bDimX = arith.constant 20 : index
+  // CHECK: %[[BDIMY:.*]] = arith.constant 24
+  %bDimY = arith.constant 24 : index
+  // CHECK: %[[BDIMZ:.*]] = arith.constant 28
+  %bDimZ = arith.constant 28 : index
+
+  // CHECK: gpu.launch_func @launch_cluster_kernel::@launch_cluster_kernel clusters in (%[[CDIMX]], %[[CDIMY]], %[[CDIMZ]]) blocks in (%[[GDIMX]], %[[GDIMY]], %[[GDIMZ]]) threads in (%[[BDIMX]], %[[BDIMY]], %[[BDIMZ]]) args(%[[ARG0]] : f32, %[[ARG1]] : memref<?xf32, 1>)
+  // CHECK-NOT: gpu.launch blocks
+  gpu.launch clusters(%cx, %cy, %cz) in (%cluster_x = %cDimX, %cluster_y = %cDimY,
+                                         %cluster_z = %cDimZ)
+             blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
+                                       %grid_z = %gDimZ)
+             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
+                                        %block_z = %bDimZ) {
+    "use"(%0): (f32) -> ()
+    "some_op"(%cx, %bx, %block_x) : (index, index, index) -> ()
+    %42 = memref.load %1[%tx] : memref<?xf32, 1>
+    gpu.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: gpu.module @launch_cluster_kernel
+// CHECK-NEXT: gpu.func @launch_cluster_kernel
+// CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
+// CHECK-SAME: gpu.known_block_size = array<i32: 20, 24, 28>
+// CHECK-SAME: gpu.known_grid_size = array<i32: 8, 12, 16>
+// CHECK-NEXT: %[[BID:.*]] = gpu.block_id x
+// CHECK-NEXT: = gpu.block_id y
+// CHECK-NEXT: = gpu.block_id z
+// CHECK-NEXT: %[[TID:.*]] = gpu.thread_id x
+// CHECK-NEXT: = gpu.thread_id y
+// CHECK-NEXT: = gpu.thread_id z
+// CHECK-NEXT: = gpu.grid_dim x
+// CHECK-NEXT: = gpu.grid_dim y
+// CHECK-NEXT: = gpu.grid_dim z
+// CHECK-NEXT: %[[BDIM:.*]] = gpu.block_dim x
+// CHECK-NEXT: = gpu.block_dim y
+// CHECK-NEXT: = gpu.block_dim z
+// CHECK-NEXT: %[[CID:.*]] = gpu.cluster_id x
+// CHECK-NEXT: = gpu.cluster_id y
+// CHECK-NEXT: = gpu.cluster_id z
+// CHECK-NEXT: %[[CDIM:.*]] = gpu.cluster_dim x
+// CHECK-NEXT: = gpu.cluster_dim y
+// CHECK-NEXT: = gpu.cluster_dim z
+// CHECK-NEXT: cf.br ^[[BLOCK:.*]]
+// CHECK-NEXT: ^[[BLOCK]]:
+// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
+// CHECK-NEXT: "some_op"(%[[CID]], %[[BID]], %[[BDIM]]) : (index, index, index) -> ()
+// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
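For readability, the outlined kernel that the CHECK lines above spell out looks roughly as follows (a reconstruction for illustration only; SSA names are invented, and the y/z index ops are elided where unused):

```mlir
gpu.module @launch_cluster_kernel {
  gpu.func @launch_cluster_kernel(%arg0: f32, %arg1: memref<?xf32, 1>) kernel
      attributes {gpu.known_block_size = array<i32: 20, 24, 28>,
                  gpu.known_grid_size = array<i32: 8, 12, 16>} {
    %bid = gpu.block_id x
    %tid = gpu.thread_id x
    %bdim = gpu.block_dim x
    %cid = gpu.cluster_id x
    "use"(%arg0) : (f32) -> ()
    "some_op"(%cid, %bid, %bdim) : (index, index, index) -> ()
    %v = memref.load %arg1[%tid] : memref<?xf32, 1>
    gpu.return
  }
}
```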