[mlir][gpu] Add Support for Cluster of Thread Blocks in gpu.launch (l…
grypp authored Jan 6, 2024
1 parent ab073cb commit 5b33cff
Showing 6 changed files with 219 additions and 22 deletions.
53 changes: 48 additions & 5 deletions mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -678,6 +678,9 @@ def GPU_LaunchOp : GPU_Op<"launch", [
Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
Optional<Index>:$clusterSizeX,
Optional<Index>:$clusterSizeY,
Optional<Index>:$clusterSizeZ,
Optional<I32>:$dynamicSharedMemorySize)>,
Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
let summary = "GPU kernel launch operation";
@@ -700,8 +703,11 @@ def GPU_LaunchOp : GPU_Op<"launch", [
to the amount of dynamic shared memory a kernel's workgroup should be
allocated; when this operand is not present, a zero size is assumed.

-The body region has at least _twelve_ arguments, grouped as follows:
+The body region has at least _twelve_ arguments, or _eighteen_ if cluster
+dimensions are present, grouped as follows:

- three optional arguments that contain cluster identifiers along x,y,z
dimensions;
- three arguments that contain block identifiers along x,y,z dimensions;
- three arguments that contain thread identifiers along x,y,z dimensions;
- operands of the `gpu.launch` operation as is (i.e. the operands for
@@ -713,6 +719,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [

```
operation ::= `gpu.launch` (`async` (`[` ssa-id-list `]`)? )?
( `clusters` `(` ssa-id-list `)` `in` ssa-reassignment )?
`blocks` `(` ssa-id-list `)` `in` ssa-reassignment
`threads` `(` ssa-id-list `)` `in` ssa-reassignment
(dynamic_shared_memory_size ssa-use)?
@@ -763,6 +770,16 @@ def GPU_LaunchOp : GPU_Op<"launch", [
// Assuming %val1 is defined outside the gpu.launch region.
%42 = load %workgroup[%bx] : memref<32xf32, 3>
}

// Launch with clusters.
gpu.launch clusters(%cx, %cy, %cz) in (%sz_cx = %0, %sz_cy = %1, %sz_cz = %2)
blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5)
threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8)
{
// Cluster, block, and thread identifiers, as well as cluster/block/grid
// sizes, are immediately usable inside the body region.
"some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
}
```

Rationale: using operation/block arguments gives analyses a clear way of
@@ -784,25 +801,35 @@ def GPU_LaunchOp : GPU_Op<"launch", [
CArg<"Type", "nullptr">:$asyncTokenType,
CArg<"ValueRange", "{}">:$asyncDependencies,
CArg<"TypeRange", "{}">:$workgroupAttributions,
CArg<"TypeRange", "{}">:$privateAttributions)>
CArg<"TypeRange", "{}">:$privateAttributions,
CArg<"Value", "nullptr">:$clusterSizeX,
CArg<"Value", "nullptr">:$clusterSizeY,
CArg<"Value", "nullptr">:$clusterSizeZ)>
];
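// Usage sketch (illustrative only; `builder`, `loc`, and the size Values are
// assumed caller-side names): the three cluster operands are trailing
// parameters defaulting to nullptr, so existing call sites keep compiling.
//   builder.create<gpu::LaunchOp>(
//       loc, gridX, gridY, gridZ, blkX, blkY, blkZ,
//       /*dynamicSharedMemorySize=*/Value(), /*asyncTokenType=*/nullptr,
//       /*asyncDependencies=*/ValueRange(),
//       /*workgroupAttributions=*/TypeRange(),
//       /*privateAttributions=*/TypeRange(), clusterX, clusterY, clusterZ);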

let extraClassDeclaration = [{
/// Get the SSA values corresponding to kernel block identifiers.
KernelDim3 getBlockIds();
/// Get the SSA values corresponding to kernel thread identifiers.
KernelDim3 getThreadIds();
/// Get the SSA values corresponding to kernel cluster identifiers.
std::optional<KernelDim3> getClusterIds();
/// Get the SSA values corresponding to kernel grid size.
KernelDim3 getGridSize();
/// Get the SSA values corresponding to kernel block size.
KernelDim3 getBlockSize();
/// Get the SSA values corresponding to kernel cluster size.
std::optional<KernelDim3> getClusterSize();

/// Get the SSA values passed as operands to specify the grid size.
KernelDim3 getGridSizeOperandValues();
/// Get the SSA values passed as operands to specify the block size.
KernelDim3 getBlockSizeOperandValues();
/// Get the SSA values passed as operands to specify the cluster size.
std::optional<KernelDim3> getClusterSizeOperandValues();

static StringRef getBlocksKeyword() { return "blocks"; }
static StringRef getClustersKeyword() { return "clusters"; }
static StringRef getThreadsKeyword() { return "threads"; }
static StringRef getDynamicSharedMemorySizeKeyword() {
return "dynamic_shared_memory_size";
@@ -816,6 +843,21 @@ def GPU_LaunchOp : GPU_Op<"launch", [
/// placed in the leading positions of the argument list.
static constexpr unsigned kNumConfigRegionAttributes = 12;

/// Returns true if cluster size is specified.
bool hasClusterSize() {
  return getClusterSizeX() && getClusterSizeY() && getClusterSizeZ();
}
/// Returns the number of operands, including the cluster size if present.
unsigned getNumConfigOperands() {
  return kNumConfigOperands + (hasClusterSize() ? 3 : 0);
}
/// Returns the number of region attributes, including the cluster ids and
/// sizes if present.
unsigned getNumConfigRegionAttributes() {
  return kNumConfigRegionAttributes + (hasClusterSize() ? 6 : 0);
}
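/// Example (sketch; `op` is an assumed gpu::LaunchOp): guard on
/// hasClusterSize() before dereferencing the optional cluster accessors.
///   if (op.hasClusterSize()) {
///     KernelDim3 ids = op.getClusterIds().value();
///     KernelDim3 dims = op.getClusterSize().value();
///   }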

/// Returns the keywords used in the custom syntax for this Op.
static StringRef getWorkgroupKeyword() { return "workgroup"; }
static StringRef getPrivateKeyword() { return "private"; }
@@ -831,7 +873,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
/// the workgroup memory
ArrayRef<BlockArgument> getWorkgroupAttributions() {
auto begin =
-std::next(getBody().args_begin(), kNumConfigRegionAttributes);
+std::next(getBody().args_begin(), getNumConfigRegionAttributes());
auto end = std::next(begin, getNumWorkgroupAttributions());
return {begin, end};
}
@@ -842,7 +884,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [

/// Returns the number of buffers located in the private memory.
unsigned getNumPrivateAttributions() {
-return getBody().getNumArguments() - kNumConfigRegionAttributes -
+return getBody().getNumArguments() - getNumConfigRegionAttributes() -
getNumWorkgroupAttributions();
}

@@ -853,7 +895,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
// memory.
auto begin =
std::next(getBody().args_begin(),
-kNumConfigRegionAttributes + getNumWorkgroupAttributions());
+getNumConfigRegionAttributes() + getNumWorkgroupAttributions());
return {begin, getBody().args_end()};
}

@@ -871,6 +913,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
let hasCanonicalizer = 1;
let hasCustomAssemblyFormat = 1;
let hasRegionVerifier = 1;
let hasVerifier = 1;
}

def GPU_PrintfOp : GPU_Op<"printf", [MemoryEffects<[MemWrite]>]>,
88 changes: 79 additions & 9 deletions mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -646,7 +646,8 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
Value getBlockSizeZ, Value dynamicSharedMemorySize,
Type asyncTokenType, ValueRange asyncDependencies,
TypeRange workgroupAttributions,
-TypeRange privateAttributions) {
+TypeRange privateAttributions, Value clusterSizeX,
+Value clusterSizeY, Value clusterSizeZ) {
// Add a WorkGroup attribution attribute. This attribute is required to
// identify private attributions in the list of block arguments.
result.addAttribute(getNumWorkgroupAttributionsAttrName(),
@@ -660,6 +661,12 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
// Add grid and block sizes as op operands, followed by the data operands.
result.addOperands({gridSizeX, gridSizeY, gridSizeZ, getBlockSizeX,
getBlockSizeY, getBlockSizeZ});
if (clusterSizeX)
result.addOperands(clusterSizeX);
if (clusterSizeY)
result.addOperands(clusterSizeY);
if (clusterSizeZ)
result.addOperands(clusterSizeZ);
if (dynamicSharedMemorySize)
result.addOperands(dynamicSharedMemorySize);

@@ -678,9 +685,12 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
body->addArgument(argTy, result.location);
kernelRegion->push_back(body);
// Fill OperandSegmentSize Attribute.
-SmallVector<int32_t, 8> segmentSizes(8, 1);
+SmallVector<int32_t, 11> segmentSizes(11, 1);
segmentSizes.front() = asyncDependencies.size();
segmentSizes.back() = dynamicSharedMemorySize ? 1 : 0;
segmentSizes[7] = clusterSizeX ? 1 : 0;
segmentSizes[8] = clusterSizeY ? 1 : 0;
segmentSizes[9] = clusterSizeZ ? 1 : 0;
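// Segment layout, matching the declared arguments: [0] asyncDependencies
// (variadic), [1..3] grid sizes, [4..6] block sizes, [7..9] cluster sizes
// (0 when absent), [10] dynamicSharedMemorySize (0 when absent).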
result.addAttribute(getOperandSegmentSizeAttr(),
builder.getDenseI32ArrayAttr(segmentSizes));
}
@@ -709,6 +719,22 @@ KernelDim3 LaunchOp::getBlockSize() {
return KernelDim3{args[9], args[10], args[11]};
}

std::optional<KernelDim3> LaunchOp::getClusterIds() {
assert(!getBody().empty() && "LaunchOp body must not be empty.");
if (!hasClusterSize())
return std::nullopt;
auto args = getBody().getArguments();
return KernelDim3{args[12], args[13], args[14]};
}

std::optional<KernelDim3> LaunchOp::getClusterSize() {
assert(!getBody().empty() && "LaunchOp body must not be empty.");
if (!hasClusterSize())
return std::nullopt;
auto args = getBody().getArguments();
return KernelDim3{args[15], args[16], args[17]};
}
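// Region-argument layout implied by the accessors above (indices into
// getBody().getArguments()): [0..2] block ids, [3..5] thread ids,
// [6..8] grid size, [9..11] block size, and, only when a cluster is
// present, [12..14] cluster ids and [15..17] cluster size.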

KernelDim3 LaunchOp::getGridSizeOperandValues() {
auto operands = getOperands().drop_front(getAsyncDependencies().size());
return KernelDim3{operands[0], operands[1], operands[2]};
@@ -719,6 +745,20 @@ KernelDim3 LaunchOp::getBlockSizeOperandValues() {
return KernelDim3{operands[3], operands[4], operands[5]};
}

std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
auto operands = getOperands().drop_front(getAsyncDependencies().size());
if (!hasClusterSize())
return std::nullopt;
return KernelDim3{operands[6], operands[7], operands[8]};
}

LogicalResult LaunchOp::verify() {
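// Note: the custom parser produces either all three cluster sizes or none,
// so this check mainly guards ops created programmatically or via the
// generic syntax with a partial cluster specification.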
if (!hasClusterSize() &&
    (getClusterSizeX() || getClusterSizeY() || getClusterSizeZ()))
  return emitOpError() << "cluster size must be all present";
return success();
}

LogicalResult LaunchOp::verifyRegions() {
// Kernel launch takes kNumConfigOperands leading operands for grid/block
// sizes and transforms them into kNumConfigRegionAttributes region arguments
@@ -778,6 +818,12 @@ void LaunchOp::print(OpAsmPrinter &p) {
p << " [" << getAsyncDependencies() << ']';
}
// Print the launch configuration.
if (hasClusterSize()) {
p << ' ' << getClustersKeyword();
printSizeAssignment(p, getClusterSize().value(),
getClusterSizeOperandValues().value(),
getClusterIds().value());
}
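// Clusters are printed before the blocks/threads segments, mirroring the
// grammar in the op definition and the parser below.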
p << ' ' << getBlocksKeyword();
printSizeAssignment(p, getGridSize(), getGridSizeOperandValues(),
getBlockIds());
@@ -831,6 +877,7 @@ parseSizeAssignment(OpAsmParser &parser,

/// Parses a Launch operation.
/// operation ::= `gpu.launch` (`async` `[` ssa-id-list `]`)?
/// (`clusters` `(` ssa-id-list `)` `in` ssa-reassignment)?
/// `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
/// `threads` `(` ssa-id-list `)` `in` ssa-reassignment
/// memory-attribution
@@ -840,15 +887,13 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// Sizes of the grid and block.
SmallVector<OpAsmParser::UnresolvedOperand, LaunchOp::kNumConfigOperands>
sizes(LaunchOp::kNumConfigOperands);
-MutableArrayRef<OpAsmParser::UnresolvedOperand> sizesRef(sizes);

// Actual (data) operands passed to the kernel.
SmallVector<OpAsmParser::UnresolvedOperand, 4> dataOperands;

// Region arguments to be created.
SmallVector<OpAsmParser::UnresolvedOperand, 16> regionArgs(
LaunchOp::kNumConfigRegionAttributes);
-MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);

// Parse optional async dependencies.
SmallVector<OpAsmParser::UnresolvedOperand, 4> asyncDependencies;
@@ -861,6 +906,24 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
if (parser.getNumResults() > 0)
result.types.push_back(asyncTokenType);

bool hasCluster = false;
if (succeeded(
parser.parseOptionalKeyword(LaunchOp::getClustersKeyword().data()))) {
hasCluster = true;
sizes.resize(9);
regionArgs.resize(18);
}
MutableArrayRef<OpAsmParser::UnresolvedOperand> sizesRef(sizes);
MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);

// The last three size assignments give the cluster size; in the region
// argument list, the cluster ids and sizes occupy the last six arguments.
if (hasCluster) {
if (parseSizeAssignment(parser, sizesRef.drop_front(6),
regionArgsRef.slice(15, 3),
regionArgsRef.slice(12, 3)))
return failure();
}
// Parse the size assignment segments: the first segment assigns grid sizes
// and defines values for block identifiers; the second segment assigns block
// sizes and defines values for thread identifiers. In the region argument
@@ -898,7 +961,7 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// LaunchOp::getNumWorkgroupAttributionsAttrName().
Type index = parser.getBuilder().getIndexType();
SmallVector<Type, LaunchOp::kNumConfigRegionAttributes> dataTypes(
-LaunchOp::kNumConfigRegionAttributes, index);
+LaunchOp::kNumConfigRegionAttributes + 6, index);
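// Reserving six extra index types is harmless when no cluster was parsed:
// the llvm::zip below stops at the shorter regionArgs range.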

SmallVector<OpAsmParser::Argument> regionArguments;
for (auto ssaValueAndType : llvm::zip(regionArgs, dataTypes)) {
@@ -916,8 +979,9 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {

// Store the number of operands we just parsed as the number of workgroup
// memory attributions.
-unsigned numWorkgroupAttrs =
-    regionArguments.size() - LaunchOp::kNumConfigRegionAttributes;
+unsigned numWorkgroupAttrs = regionArguments.size() -
+                             LaunchOp::kNumConfigRegionAttributes -
+                             (hasCluster ? 6 : 0);
result.addAttribute(LaunchOp::getNumWorkgroupAttributionsAttrName(),
builder.getI64IntegerAttr(numWorkgroupAttrs));

@@ -934,8 +998,14 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
parser.parseOptionalAttrDict(result.attributes))
return failure();

-SmallVector<int32_t, 8> segmentSizes(8, 1);
+SmallVector<int32_t, 11> segmentSizes(11, 1);
segmentSizes.front() = asyncDependencies.size();

if (!hasCluster) {
segmentSizes[7] = 0;
segmentSizes[8] = 0;
segmentSizes[9] = 0;
}
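// Mirrors the segment layout used in LaunchOp::build: entries 7..9 are the
// optional cluster sizes, entry 10 the dynamic shared memory size.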
segmentSizes.back() = hasDynamicSharedMemorySize ? 1 : 0;
result.addAttribute(LaunchOp::getOperandSegmentSizeAttr(),
parser.getBuilder().getDenseI32ArrayAttr(segmentSizes));
@@ -992,7 +1062,7 @@ BlockArgument LaunchOp::addWorkgroupAttribution(Type type, Location loc) {
(*this)->setAttr(attrName,
IntegerAttr::get(attr.getType(), attr.getValue() + 1));
return getBody().insertArgument(
-LaunchOp::kNumConfigRegionAttributes + attr.getInt(), type, loc);
+LaunchOp::getNumConfigRegionAttributes() + attr.getInt(), type, loc);
}

/// Adds a new block argument that corresponds to buffers located in
20 changes: 15 additions & 5 deletions mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -49,15 +49,21 @@ static void createForAllDimensions(OpBuilder &builder, Location loc,
/// entry block of `launchOpBody`, to the corresponding result value of the
/// added operations.
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
-Region &launchOpBody, IRMapping &map) {
+Region &launchOpBody, IRMapping &map,
+bool hasCluster = false) {
OpBuilder builder(loc->getContext());
Block &firstBlock = launchOpBody.front();
builder.setInsertionPointToStart(&launchFuncOpBody.front());
-SmallVector<Value, 12> indexOps;
+SmallVector<Value> indexOps;
// The order is important here, as it must match the order of the region
// arguments.
createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
if (hasCluster) {
createForAllDimensions<gpu::ClusterIdOp>(builder, loc, indexOps);
createForAllDimensions<gpu::ClusterDimOp>(builder, loc, indexOps);
}
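// With clusters, indexOps now holds 18 values whose order mirrors the launch
// region arguments: block ids, thread ids, grid dims, block dims, then
// cluster ids and cluster dims.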
// Replace the leading 12 (or, with clusters, 18) function args with the
// respective index operations. Iterate backwards since args are erased and
// indices change.
for (const auto &indexOp : enumerate(indexOps))
@@ -212,9 +218,11 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
IRMapping map;

// Map the arguments corresponding to the launch parameters like blockIdx,
-// threadIdx, etc.
+// threadIdx, etc. If a cluster size is present, we also generate clusterIdx
+// and clusterDim.
Region &outlinedFuncBody = outlinedFunc.getBody();
-injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);
+injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map,
+                         launchOp.hasClusterSize());

// Map memory attributions from the LaunchOp to the GPUFuncOp attributions.
for (const auto &[launchArg, funcArg] :
@@ -278,12 +286,14 @@ static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
// The launch op has an optional dynamic shared memory size. If it doesn't
// exist, we use zero.
Value asyncToken = launchOp.getAsyncToken();
std::optional<gpu::KernelDim3> clusterSize =
launchOp.getClusterSizeOperandValues();
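// Forward the optional cluster size so the generated gpu.launch_func carries
// the same cluster configuration as the original gpu.launch.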
auto launchFunc = builder.create<gpu::LaunchFuncOp>(
launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
launchOp.getBlockSizeOperandValues(),
launchOp.getDynamicSharedMemorySize(), operands,
asyncToken ? asyncToken.getType() : nullptr,
-launchOp.getAsyncDependencies());
+launchOp.getAsyncDependencies(), clusterSize);
launchOp.replaceAllUsesWith(launchFunc);
launchOp.erase();
}
4 changes: 2 additions & 2 deletions mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
@@ -17,8 +17,8 @@ func.func @one_d_loop(%A : memref<?xf32>, %B : memref<?xf32>) {
// CHECK-BLOCKS-NEXT: %{{.*}} = arith.constant 1 : index
// CHECK-BLOCKS-NEXT: %[[ONE:.*]] = arith.constant 1 : index

-// CHECK-THREADS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}}0 = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
-// CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}}0 = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
+// CHECK-THREADS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
+// CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
affine.for %i = 0 to 42 {
// CHECK-THREADS-NEXT: %[[INDEX:.*]] = arith.addi %{{.*}}, %[[T0]]
// CHECK-THREADS-NEXT: memref.load %{{.*}}[%[[INDEX]]]
