Skip to content

Commit

Permalink
Preparation for data-tiled multi_mma codegen (#18532)
Browse files Browse the repository at this point in the history
* Refactor `get*SingleSubgroupLayout`.
* Move the logic to global functions so they can be shared with other
code. This will need to be shared with `DataTiledMMAAttr` and also with
other utility functions dealing with swizzlings.
* Use `MMAFragment` consistently.
* We already have this enum for identifying A/B/C operands in a matmul,
so use that instead of raw integer operandIndex.
* Compute instruction-level swizzles from subgroup layouts.
* The logic in `getIntrinsicSwizzle` was redundant with the existing
subgroup layout descriptions, and was less general. Now it's just
computed.
* Create `TileSwizzle.h` to decouple `TileSwizzle` from
`MaterializeEncodingInfo`.
* This data structure is soon going to be used outside of
`MaterializeEncoding` logic.
* Introduce `GPUTileSwizzleUtils.cpp` to move some code out of
`GPUMaterializeEncoding`.

---------

Signed-off-by: Benoit Jacob <[email protected]>
  • Loading branch information
bjacob committed Sep 17, 2024
1 parent 6fdc30f commit 740e301
Show file tree
Hide file tree
Showing 13 changed files with 360 additions and 253 deletions.
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ iree_compiler_cc_library(
"PassUtils.h",
"Passes.h",
"TileSizeSelection.h",
"TileSwizzle.h",
"Transforms.h",
"UserConfig.h",
],
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ iree_cc_library(
"PassUtils.h"
"Passes.h"
"TileSizeSelection.h"
"TileSwizzle.h"
"Transforms.h"
"UserConfig.h"
SRCS
Expand Down
30 changes: 3 additions & 27 deletions compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#ifndef IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_
#define IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_

#include "iree/compiler/Codegen/Common/TileSwizzle.h"
#include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h"
#include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
Expand All @@ -15,41 +16,16 @@
namespace mlir::iree_compiler {

/// Container of information needed to materialize the layout transformations.
///
/// On CPU, these layout transformations consist of a single `tensor.pack`
/// or `tensor.unpack` op, implementing a tiled layout where each tile is
/// row-major.
///
/// On GPU, there is an additional `swizzle`, which changes the layout inside
/// of the tile. See the comment on the nested Swizzle struct.
struct MaterializeEncodingInfo {
// Metadata for a swizzle, that is, an (expand_shape -> transposition)
// pair of ops performing a change of layout within the tiles. This is used
// on GPU, where the tiles themselves can have an arbitrary layout.
struct Swizzle {
// This vector-of-vectors contains all the information needed to generate
// a `tensor.expand_shape` creating additional internal dimensions into the
// tile. For example, expandShape = [[16], [4, 2]] means that the original
// tile shape [16, 8] gets expanded such that the first dimension 16 is left
// unchanged, and the second dimension 8 gets split into two internal dims
// of size 4 and 2.
SmallVector<SmallVector<int64_t>> expandShape;
// This permutation vector applies to the expanded dimensions and is used
// to generate a `linalg.transpose` changing the layout of the tile. For
// example, permutation[0] dictates which of the expanded dimensions becomes
// the leading dimension of the layout.
SmallVector<int64_t> permutation;
};

// The next 3 fields are used to create a `tensor.pack` or `tensor.unpack` op,
// changing the overall layout between row-major and tiled (where each tile is
// row-major).
SmallVector<int64_t> innerDimsPos;
SmallVector<int64_t> innerTileSizes;
SmallVector<int64_t> outerDimsPerm;

// The optional swizzle, see the above comment on Swizzle. Only used on GPU.
std::optional<Swizzle> swizzle;
// The optional swizzle, see the comment on TileSwizzle. Only used on GPU.
std::optional<TileSwizzle> swizzle;
};

using MaterializeEncodingFn = std::function<FailureOr<MaterializeEncodingInfo>(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ namespace {
using namespace mlir::iree_compiler::IREE::VectorExt;
using VectorValue = TypedValue<VectorType>;

static LogicalResult isSubgroupLayoutCompatible(
IREE::GPU::MMAAttr::SingleSubgroupLayout subgroupLayout,
NestedLayoutAttr layout, int64_t dim1, int64_t dim2) {
static LogicalResult
isSubgroupLayoutCompatible(IREE::GPU::MMASingleSubgroupLayout subgroupLayout,
NestedLayoutAttr layout, int64_t dim1,
int64_t dim2) {
SmallVector<int64_t> element = {layout.getElementTile()[dim1],
layout.getElementTile()[dim2]};
SmallVector<int64_t> thread = {layout.getThreadTile()[dim1],
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ iree_compiler_cc_library(
"GPUTensorTileToSerialLoops.cpp",
"GPUTile.cpp",
"GPUTileReduction.cpp",
"GPUTileSwizzleUtils.cpp",
"GPUVectorAlloc.cpp",
"GPUVectorDistribution.cpp",
"GPUVerifyDistribution.cpp",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ iree_cc_library(
"GPUTensorTileToSerialLoops.cpp"
"GPUTile.cpp"
"GPUTileReduction.cpp"
"GPUTileSwizzleUtils.cpp"
"GPUVectorAlloc.cpp"
"GPUVectorDistribution.cpp"
"GPUVerifyDistribution.cpp"
Expand Down
132 changes: 11 additions & 121 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/Common/EncodingUtils.h"
#include "iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.h"
#include "iree/compiler/Codegen/Common/GPU/Passes.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
Expand Down Expand Up @@ -36,118 +37,6 @@ namespace mlir::iree_compiler {
#define GEN_PASS_DEF_GPUMATERIALIZEDEVICEENCODINGPASS
#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"

// Returns the swizzle for a given intrinsic and operand index.
// See the comment on MaterializeEncodingInfo::Swizzle for what that means.
// This function is concerned with a single intrinsic, not a whole kernel tile.
// TODO(bjacob): derive this automatically from the intrinsic layout getters.
static MaterializeEncodingInfo::Swizzle
getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic mma, int operandIdx) {
  using Swizzle = MaterializeEncodingInfo::Swizzle;
  // Operand index 2 is the accumulator (C matrix). Its swizzle happens to be
  // identical across the intrinsics handled below; only the A/B swizzle
  // differs from one intrinsic to the next.
  const bool isAccumulator = (operandIdx == 2);
  switch (mma) {
  case IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x4_F32:
    if (isAccumulator) {
      return Swizzle{/*expandShape=*/{{4, 4}, {16}},
                     /*permutation=*/{0, 2, 1}};
    }
    return Swizzle{/*expandShape=*/{{16}, {4}},
                   /*permutation=*/{1, 0}};
  case IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x16_F16:
    if (isAccumulator) {
      return Swizzle{/*expandShape=*/{{4, 4}, {16}},
                     /*permutation=*/{0, 2, 1}};
    }
    return Swizzle{/*expandShape=*/{{16}, {4, 4}},
                   /*permutation=*/{1, 0, 2}};
  case IREE::GPU::MMAIntrinsic::MFMA_I32_16x16x32_I8:
    if (isAccumulator) {
      return Swizzle{/*expandShape=*/{{4, 4}, {16}},
                     /*permutation=*/{0, 2, 1}};
    }
    return Swizzle{/*expandShape=*/{{16}, {4, 8}},
                   /*permutation=*/{1, 0, 2}};
  default:
    assert(false && "should not get here.");
    return {};
  }
}

// Given an `expandShape` vector-of-vectors describing the mapping from source
// dimensions to expanded dimensions, returns the index of the first expanded
// dimension corresponding to the given source dimension index.
static int64_t
getExpandedDimFirstIdx(const SmallVector<SmallVector<int64_t>> &expandShape,
                       int64_t srcIndex) {
  // Expanded dimensions are laid out group-by-group in source order, so the
  // first expanded index for `srcIndex` is the total size of all preceding
  // groups.
  int64_t firstExpandedIdx = 0;
  for (int64_t group = 0; group < srcIndex; ++group) {
    firstExpandedIdx += static_cast<int64_t>(expandShape[group].size());
  }
  return firstExpandedIdx;
}

// Unroll the dimension given by `srcIndex` by the given `unrollFactor`.
// This is not interleaving layouts. The layout will consist of multiple copies
// of the input tile, side by side.
//
// Example:
//    Input swizzle = { expandShape = [[16], [4]], permutation = [1, 0] }
//    Input srcIndex = 1
//    Input unrollFactor = 4
// -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] }
//
static void unroll(MaterializeEncodingInfo::Swizzle &swizzle, int srcIndex,
                   int unrollFactor) {
  assert(unrollFactor > 1);
  int insertionPos = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex);

  // The new unrolling dimension is inserted at the start of the expandShape
  // dimensions group corresponding to srcIndex.
  SmallVector<int64_t> &group = swizzle.expandShape[srcIndex];
  group.insert(group.begin(), unrollFactor);
  // Since we are not interleaving here, generating side-by-side copies of the
  // original layout, the new unrolling dimension is the new outermost
  // dimension. Existing entries get shifted to make room for it.
  for (int64_t &dim : swizzle.permutation) {
    if (dim >= insertionPos) {
      ++dim;
    }
  }
  swizzle.permutation.insert(swizzle.permutation.begin(), insertionPos);
}

// Interleave the layout in `swizzle` by mutating `swizzle.permutation` to
// move permutation[0], the outer-most dimension (which the unroll() function
// created to be the unrolling dimension), to the inner dimension given by
// `expandedDimIndexToInterleaveAt`.
//
// Example:
//    Input swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] }
//    Input srcIndex = 1
//    Input expandedDimIndexToInterleaveAt = 1
// -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [2, 0, 1] }
//
static void interleave(MaterializeEncodingInfo::Swizzle &swizzle, int srcIndex,
                       int expandedDimIndexToInterleaveAt) {
  // Compute which inner dimension to permute the current outer dimension into.
  int dstIndexFirst = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex);
  int dstIndexToInterleaveAt = dstIndexFirst + expandedDimIndexToInterleaveAt;

  // Use a signed size to avoid the signed/unsigned comparison in the loop
  // bounds below (SmallVector::size() is unsigned).
  int permSize = static_cast<int>(swizzle.permutation.size());
  SmallVector<int64_t> outPermutation(permSize);
  // The leading dimension, permutation[0], gets moved inwards to the
  // position that we just computed, dstIndexToInterleaveAt.
  outPermutation[dstIndexToInterleaveAt] = swizzle.permutation[0];
  // Outer dimensions get shifted outwards to fill the gap.
  for (int i = 0; i < dstIndexToInterleaveAt; ++i) {
    outPermutation[i] = swizzle.permutation[i + 1];
  }
  // Inner dimensions don't change. That is to say that we only interleave at
  // the granularity of whole expanded dimensions; we never swizzle further
  // within one expanded dimension.
  for (int i = dstIndexToInterleaveAt + 1; i < permSize; ++i) {
    outPermutation[i] = swizzle.permutation[i];
  }
  swizzle.permutation = outPermutation;
}

// Returns the index of the dimension whose flattened size (flattening inner
// dimensions into it) matches the given `targetSize`. This is used to compute
// interleaving indices.
Expand All @@ -174,16 +63,16 @@ static int64_t getDimIdxForTargetSize(const SmallVector<int64_t> &shape,

// Generates the swizzle for the full data-tiled-mma tile, including all the
// relevant unrolling factors.
static MaterializeEncodingInfo::Swizzle
getSwizzle(IREE::GPU::DataTiledMMAAttr mma, int operandIdx) {
static TileSwizzle getSwizzle(IREE::GPU::DataTiledMMAAttr mma,
IREE::GPU::MMAFragment fragment) {
auto [AType, BType, CType] = mma.getABCElementTypes();
int ABits = AType.getIntOrFloatBitWidth();
int BBits = BType.getIntOrFloatBitWidth();
// TODO(bjacob): Should be looked up from GPU target, instead of hard-coded.
const int targetPreferredLoadBitWidth = 128;
auto swizzle = getIntrinsicSwizzle(mma.getIntrinsic().getValue(), operandIdx);
switch (operandIdx) {
case 0:
auto swizzle = getIntrinsicSwizzle(mma.getIntrinsic().getValue(), fragment);
switch (fragment) {
case IREE::GPU::MMAFragment::Lhs:
// A-matrix (LHS). Source dimensions are M (index 0) and K (index 1).
// Unroll on K with interleaving, then on M.
if (mma.getUnrollK() > 1) {
Expand All @@ -197,7 +86,7 @@ getSwizzle(IREE::GPU::DataTiledMMAAttr mma, int operandIdx) {
unroll(swizzle, 0, mma.getUnrollM());
}
break;
case 1:
case IREE::GPU::MMAFragment::Rhs:
// B-matrix (RHS). Since the pack ops already took care of transposing B,
// source dimensions are N (index 0) and K (index 1).
// Unroll on K with interleaving, then on N.
Expand All @@ -212,7 +101,7 @@ getSwizzle(IREE::GPU::DataTiledMMAAttr mma, int operandIdx) {
unroll(swizzle, 0, mma.getUnrollN());
}
break;
case 2:
case IREE::GPU::MMAFragment::Acc:
// C-matrix (accumulator). Source dimensions are M (index 0) and N (index
// 1). Unroll on N, then on M.
if (mma.getUnrollN() > 1) {
Expand Down Expand Up @@ -310,8 +199,9 @@ materializeEncodingForTarget(RankedTensorType tensorType,
TileMxNxK innerTile;
std::tie(innerTile.M, innerTile.N, innerTile.K) = mma->getMNKShape();
auto encodingInfo = getEncodingInfoForMatmul(encoding, rank, innerTile);
auto operandIdx = encoding.getOperandIndex().getInt();
encodingInfo.swizzle = getSwizzle(*mma, operandIdx);
auto fragment =
static_cast<IREE::GPU::MMAFragment>(encoding.getOperandIndex().getInt());
encodingInfo.swizzle = getSwizzle(*mma, fragment);
return encodingInfo;
}

Expand Down
Loading

0 comments on commit 740e301

Please sign in to comment.