Skip to content

Commit

Permalink
Preparation for data-tiled multi_mma codegen (#18532)
Browse files Browse the repository at this point in the history
* Refactor `get*SingleSubgroupLayout`.
* Move the logic to global functions so they can be shared with other
code. This will need to be shared with `DataTiledMMAAttr` and also with
other utility functions dealing with swizzlings.
* Use `MMAFragment` consistently.
* We already have this enum for identifying A/B/C operands in a matmul,
so use that instead of raw integer operandIndex.
* Compute instruction-level swizzles from subgroup layouts.
* The logic in `getIntrinsicSwizzle` was redundant with the existing
subgroup layout descriptions, and was less general. Now it's just
computed.
* Create `TileSwizzle.h` to decouple `TileSwizzle` from
`MaterializeEncodingInfo`.
* This data structure is soon going to be used outside of
`MaterializeEncoding` logic.
* Introduce `GPUTileSwizzleUtils.cpp` to move some code out of
`GPUMaterializeEncoding`.

---------

Signed-off-by: Benoit Jacob <[email protected]>
  • Loading branch information
bjacob committed Sep 17, 2024
1 parent 6fdc30f commit 740e301
Show file tree
Hide file tree
Showing 13 changed files with 360 additions and 253 deletions.
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ iree_compiler_cc_library(
"PassUtils.h",
"Passes.h",
"TileSizeSelection.h",
"TileSwizzle.h",
"Transforms.h",
"UserConfig.h",
],
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ iree_cc_library(
"PassUtils.h"
"Passes.h"
"TileSizeSelection.h"
"TileSwizzle.h"
"Transforms.h"
"UserConfig.h"
SRCS
Expand Down
30 changes: 3 additions & 27 deletions compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#ifndef IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_
#define IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_

#include "iree/compiler/Codegen/Common/TileSwizzle.h"
#include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h"
#include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
Expand All @@ -15,41 +16,16 @@
namespace mlir::iree_compiler {

/// Container of information needed to materialize the layout transformations.
///
/// On CPU, these layout transformations consist of a single `tensor.pack`
/// or `tensor.unpack` op, implementing a tiled layout where each tile is
/// row-major.
///
/// On GPU, there is an additional `swizzle`, which changes the layout inside
/// of the tile. See the comment on the nested Swizzle struct.
struct MaterializeEncodingInfo {
// Metadata for a swizzle, that is, an (expand_shape -> transposition)
// pair of ops performing a change of layout within the tiles. This is used
// on GPU, where the tiles themselves can have an arbitrary layout.
struct Swizzle {
// This vector-of-vectors contains all the information needed to generate
// a `tensor.expand_shape` creating additional internal dimensions into the
// tile. For example, expandShape = [[16], [4, 2]] means that the original
// tile shape [16, 8] gets expanded such that the first dimension 16 is left
// unchanged, and the second dimension 8 gets split into two internal dims
// of size 4 and 2.
SmallVector<SmallVector<int64_t>> expandShape;
// This permutation vector applies to the expanded dimensions and is used
// to generate a `linalg.transpose` changing the layout of the tile. For
// example, permutation[0] dictates which of the expanded dimensions becomes
// the leading dimension of the layout.
SmallVector<int64_t> permutation;
};

// The next 3 fields are used to create a `tensor.pack` or `tensor.unpack` op,
// changing the overall layout between row-major and tiled (where each tile is
// row-major).
SmallVector<int64_t> innerDimsPos;
SmallVector<int64_t> innerTileSizes;
SmallVector<int64_t> outerDimsPerm;

// The optional swizzle, see the above comment on Swizzle. Only used on GPU.
std::optional<Swizzle> swizzle;
// The optional swizzle, see the comment on TileSwizzle. Only used on GPU.
std::optional<TileSwizzle> swizzle;
};

using MaterializeEncodingFn = std::function<FailureOr<MaterializeEncodingInfo>(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ namespace {
using namespace mlir::iree_compiler::IREE::VectorExt;
using VectorValue = TypedValue<VectorType>;

static LogicalResult isSubgroupLayoutCompatible(
IREE::GPU::MMAAttr::SingleSubgroupLayout subgroupLayout,
NestedLayoutAttr layout, int64_t dim1, int64_t dim2) {
static LogicalResult
isSubgroupLayoutCompatible(IREE::GPU::MMASingleSubgroupLayout subgroupLayout,
NestedLayoutAttr layout, int64_t dim1,
int64_t dim2) {
SmallVector<int64_t> element = {layout.getElementTile()[dim1],
layout.getElementTile()[dim2]};
SmallVector<int64_t> thread = {layout.getThreadTile()[dim1],
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ iree_compiler_cc_library(
"GPUTensorTileToSerialLoops.cpp",
"GPUTile.cpp",
"GPUTileReduction.cpp",
"GPUTileSwizzleUtils.cpp",
"GPUVectorAlloc.cpp",
"GPUVectorDistribution.cpp",
"GPUVerifyDistribution.cpp",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ iree_cc_library(
"GPUTensorTileToSerialLoops.cpp"
"GPUTile.cpp"
"GPUTileReduction.cpp"
"GPUTileSwizzleUtils.cpp"
"GPUVectorAlloc.cpp"
"GPUVectorDistribution.cpp"
"GPUVerifyDistribution.cpp"
Expand Down
132 changes: 11 additions & 121 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/Common/EncodingUtils.h"
#include "iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.h"
#include "iree/compiler/Codegen/Common/GPU/Passes.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
Expand Down Expand Up @@ -36,118 +37,6 @@ namespace mlir::iree_compiler {
#define GEN_PASS_DEF_GPUMATERIALIZEDEVICEENCODINGPASS
#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"

// Returns the swizzle for a given intrinsic and operand index.
// See the comment on MaterializeEncodingInfo::Swizzle for what that means.
// This function is concerned with a single intrinsic, not a whole kernel tile.
// TODO(bjacob): derive this automatically from the intrinsic layout getters.
static MaterializeEncodingInfo::Swizzle
getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic mma, int operandIdx) {
  using Swizzle = MaterializeEncodingInfo::Swizzle;
  // Operand index 2 is the accumulator (C matrix). Its swizzle happens to be
  // identical across the intrinsics handled below; only the A/B swizzle
  // differs from one intrinsic to the next.
  const bool isAccumulator = (operandIdx == 2);
  switch (mma) {
  case IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x4_F32:
    if (isAccumulator) {
      return Swizzle{/*expandShape=*/{{4, 4}, {16}},
                     /*permutation=*/{0, 2, 1}};
    }
    return Swizzle{/*expandShape=*/{{16}, {4}},
                   /*permutation=*/{1, 0}};
  case IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x16_F16:
    if (isAccumulator) {
      return Swizzle{/*expandShape=*/{{4, 4}, {16}},
                     /*permutation=*/{0, 2, 1}};
    }
    return Swizzle{/*expandShape=*/{{16}, {4, 4}},
                   /*permutation=*/{1, 0, 2}};
  case IREE::GPU::MMAIntrinsic::MFMA_I32_16x16x32_I8:
    if (isAccumulator) {
      return Swizzle{/*expandShape=*/{{4, 4}, {16}},
                     /*permutation=*/{0, 2, 1}};
    }
    return Swizzle{/*expandShape=*/{{16}, {4, 8}},
                   /*permutation=*/{1, 0, 2}};
  default:
    assert(false && "should not get here.");
    return {};
  }
}

// Given an `expandShape` vector-of-vectors describing the mapping from source
// dimensions to expanded dimensions, returns the index of the first expanded
// dimension corresponding to the given source dimension index.
static int64_t
getExpandedDimFirstIdx(const SmallVector<SmallVector<int64_t>> &expandShape,
                       int64_t srcIndex) {
  // Expanded dimensions are laid out group-by-group in source order, so the
  // first expanded index for `srcIndex` is the total size of all preceding
  // groups.
  int64_t firstExpandedIdx = 0;
  for (int64_t group = 0; group < srcIndex; ++group) {
    firstExpandedIdx += static_cast<int64_t>(expandShape[group].size());
  }
  return firstExpandedIdx;
}

// Unroll the dimension given by `srcIndex` by the given `unrollFactor`.
// This is not interleaving layouts. The layout will consist of multiple copies
// of the input tile, side by side.
//
// Example:
//    Input swizzle = { expandShape = [[16], [4]], permutation = [1, 0] }
//    Input srcIndex = 1
//    Input unrollFactor = 4
// -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] }
//
static void unroll(MaterializeEncodingInfo::Swizzle &swizzle, int srcIndex,
                   int unrollFactor) {
  assert(unrollFactor > 1);
  int insertionPos = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex);

  // The new unrolling dimension is inserted at the start of the expandShape
  // dimensions group corresponding to srcIndex.
  SmallVector<int64_t> &group = swizzle.expandShape[srcIndex];
  group.insert(group.begin(), unrollFactor);
  // Since we are not interleaving here, generating side-by-side copies of the
  // original layout, the new unrolling dimension is the new outermost
  // dimension. Existing entries get shifted to make room for it.
  for (int64_t &dim : swizzle.permutation) {
    if (dim >= insertionPos) {
      ++dim;
    }
  }
  swizzle.permutation.insert(swizzle.permutation.begin(), insertionPos);
}

// Interleave the layout in `swizzle` by mutating `swizzle.permutation` to
// move permutation[0], the outer-most dimension (which the unroll() function
// created to be the unrolling dimension), to the inner dimension given by
// `expandedDimIndexToInterleaveAt`.
//
// Example:
//    Input swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] }
//    Input srcIndex = 1
//    Input expandedDimIndexToInterleaveAt = 1
// -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [2, 0, 1] }
//
static void interleave(MaterializeEncodingInfo::Swizzle &swizzle, int srcIndex,
                       int expandedDimIndexToInterleaveAt) {
  // Compute which inner dimension to permute the current outer dimension into.
  int dstIndexFirst = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex);
  int dstIndexToInterleaveAt = dstIndexFirst + expandedDimIndexToInterleaveAt;

  // Use a signed size to avoid the signed/unsigned comparison in the loop
  // bounds below (SmallVector::size() is unsigned).
  int permSize = static_cast<int>(swizzle.permutation.size());
  SmallVector<int64_t> outPermutation(permSize);
  // The leading dimension, permutation[0], gets moved inwards to the
  // position that we just computed, dstIndexToInterleaveAt.
  outPermutation[dstIndexToInterleaveAt] = swizzle.permutation[0];
  // Outer dimensions get shifted outwards to fill the gap.
  for (int i = 0; i < dstIndexToInterleaveAt; ++i) {
    outPermutation[i] = swizzle.permutation[i + 1];
  }
  // Inner dimensions don't change. That is to say that we only interleave at
  // the granularity of whole expanded dimensions; we never swizzle further
  // within one expanded dimension.
  for (int i = dstIndexToInterleaveAt + 1; i < permSize; ++i) {
    outPermutation[i] = swizzle.permutation[i];
  }
  swizzle.permutation = outPermutation;
}

// Returns the index of the dimension whose flattened size (flattening inner
// dimensions into it) matches the given `targetSize`. This is used to compute
// interleaving indices.
Expand All @@ -174,16 +63,16 @@ static int64_t getDimIdxForTargetSize(const SmallVector<int64_t> &shape,

// Generates the swizzle for the full data-tiled-mma tile, including all the
// relevant unrolling factors.
static MaterializeEncodingInfo::Swizzle
getSwizzle(IREE::GPU::DataTiledMMAAttr mma, int operandIdx) {
static TileSwizzle getSwizzle(IREE::GPU::DataTiledMMAAttr mma,
IREE::GPU::MMAFragment fragment) {
auto [AType, BType, CType] = mma.getABCElementTypes();
int ABits = AType.getIntOrFloatBitWidth();
int BBits = BType.getIntOrFloatBitWidth();
// TODO(bjacob): Should be looked up from GPU target, instead of hard-coded.
const int targetPreferredLoadBitWidth = 128;
auto swizzle = getIntrinsicSwizzle(mma.getIntrinsic().getValue(), operandIdx);
switch (operandIdx) {
case 0:
auto swizzle = getIntrinsicSwizzle(mma.getIntrinsic().getValue(), fragment);
switch (fragment) {
case IREE::GPU::MMAFragment::Lhs:
// A-matrix (LHS). Source dimensions are M (index 0) and K (index 1).
// Unroll on K with interleaving, then on M.
if (mma.getUnrollK() > 1) {
Expand All @@ -197,7 +86,7 @@ getSwizzle(IREE::GPU::DataTiledMMAAttr mma, int operandIdx) {
unroll(swizzle, 0, mma.getUnrollM());
}
break;
case 1:
case IREE::GPU::MMAFragment::Rhs:
// B-matrix (RHS). Since the pack ops already took care of transposing B,
// source dimensions are N (index 0) and K (index 1).
// Unroll on K with interleaving, then on N.
Expand All @@ -212,7 +101,7 @@ getSwizzle(IREE::GPU::DataTiledMMAAttr mma, int operandIdx) {
unroll(swizzle, 0, mma.getUnrollN());
}
break;
case 2:
case IREE::GPU::MMAFragment::Acc:
// C-matrix (accumulator). Source dimensions are M (index 0) and N (index
// 1). Unroll on N, then on M.
if (mma.getUnrollN() > 1) {
Expand Down Expand Up @@ -310,8 +199,9 @@ materializeEncodingForTarget(RankedTensorType tensorType,
TileMxNxK innerTile;
std::tie(innerTile.M, innerTile.N, innerTile.K) = mma->getMNKShape();
auto encodingInfo = getEncodingInfoForMatmul(encoding, rank, innerTile);
auto operandIdx = encoding.getOperandIndex().getInt();
encodingInfo.swizzle = getSwizzle(*mma, operandIdx);
auto fragment =
static_cast<IREE::GPU::MMAFragment>(encoding.getOperandIndex().getInt());
encodingInfo.swizzle = getSwizzle(*mma, fragment);
return encodingInfo;
}

Expand Down
Loading

0 comments on commit 740e301

Please sign in to comment.