diff --git a/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel
index 38f11a4881e7..186cffbb02f0 100644
--- a/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel
@@ -153,6 +153,7 @@ iree_compiler_cc_library(
         "PassUtils.h",
         "Passes.h",
         "TileSizeSelection.h",
+        "TileSwizzle.h",
         "Transforms.h",
         "UserConfig.h",
     ],
diff --git a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
index 7d35eb25bd64..4f3bf1e2afed 100644
--- a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
@@ -73,6 +73,7 @@ iree_cc_library(
     "PassUtils.h"
     "Passes.h"
     "TileSizeSelection.h"
+    "TileSwizzle.h"
     "Transforms.h"
     "UserConfig.h"
   SRCS
diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h
index d12a44fe8626..910826c7445b 100644
--- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h
+++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h
@@ -7,6 +7,7 @@
 #ifndef IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_
 #define IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_
 
+#include "iree/compiler/Codegen/Common/TileSwizzle.h"
 #include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h"
 #include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
@@ -15,32 +16,7 @@
 namespace mlir::iree_compiler {
 
 /// Container of information needed to materialize the layout transformations.
-///
-/// On CPU, these layout transformations consist of a single `temsor.pack`
-/// or `tensor.unpack` op, implementing a tiled layout where each tile is
-/// row-major.
-///
-/// On GPU, there is an additional `swizzle`, which changes the layout inside
-/// of the tile. See the comment on the nested Swizzle struct.
 struct MaterializeEncodingInfo {
-  // Metadata for a swizzle, that is, an (expand_shape -> transposition)
-  // pair of ops performing a change of layout within the tiles. This is used
-  // on GPU, where the tiles themselves can have an arbitrary layout.
-  struct Swizzle {
-    // This vector-of-vectors contains all the information needed to generate
-    // a `tensor.expand_shape` creating additional internal dimensions into the
-    // tile. For example, expandShape = [[16], [4, 2]] means that the original
-    // tile shape [16, 8] gets expanded such that the first dimension 16 is left
-    // unchanged, and the second dimension 8 gets split into two internal dims
-    // of size 4 and 2.
-    SmallVector<SmallVector<int64_t>> expandShape;
-    // This permutation vector applies to the expanded dimensions and is used
-    // to generate a `linalg.transpose` changing the layout of the tile. For
-    // example, permutation[0] dictates which of the expanded dimensions becomes
-    // the leading dimension of the layout.
-    SmallVector<int64_t> permutation;
-  };
-
   // The next 3 fields are used to create a `tensor.pack` or `tensor.unpack` op,
   // changing the overall layout between row-major and tiled (where each tile is
   // row-major).
@@ -48,8 +24,8 @@ struct MaterializeEncodingInfo {
   SmallVector<int64_t> innerTileSizes;
   SmallVector<int64_t> outerDimsPerm;
 
-  // The optional swizzle, see the above comment on Swizzle. Only used on GPU.
-  std::optional<Swizzle> swizzle;
+  // The optional swizzle, see the comment on TileSwizzle. Only used on GPU.
+  std::optional<TileSwizzle> swizzle;
 };
 
 using MaterializeEncodingFn = std::function<FailureOr<MaterializeEncodingInfo>(
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp
index 785b24d3becc..3e2cb427ed98 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp
@@ -21,9 +21,10 @@ namespace {
 using namespace mlir::iree_compiler::IREE::VectorExt;
 using VectorValue = TypedValue<VectorType>;
 
-static LogicalResult isSubgroupLayoutCompatible(
-    IREE::GPU::MMAAttr::SingleSubgroupLayout subgroupLayout,
-    NestedLayoutAttr layout, int64_t dim1, int64_t dim2) {
+static LogicalResult
+isSubgroupLayoutCompatible(IREE::GPU::MMASingleSubgroupLayout subgroupLayout,
+                           NestedLayoutAttr layout, int64_t dim1,
+                           int64_t dim2) {
   SmallVector<int64_t> element = {layout.getElementTile()[dim1],
                                   layout.getElementTile()[dim2]};
   SmallVector<int64_t> thread = {layout.getThreadTile()[dim1],
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
index fdfd56291084..8bdd9982aff2 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
@@ -75,6 +75,7 @@ iree_compiler_cc_library(
         "GPUTensorTileToSerialLoops.cpp",
         "GPUTile.cpp",
         "GPUTileReduction.cpp",
+        "GPUTileSwizzleUtils.cpp",
         "GPUVectorAlloc.cpp",
         "GPUVectorDistribution.cpp",
         "GPUVerifyDistribution.cpp",
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
index d73a119fda3f..82387adad22c 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
@@ -73,6 +73,7 @@ iree_cc_library(
     "GPUTensorTileToSerialLoops.cpp"
     "GPUTile.cpp"
     "GPUTileReduction.cpp"
+    "GPUTileSwizzleUtils.cpp"
     "GPUVectorAlloc.cpp"
     "GPUVectorDistribution.cpp"
     "GPUVerifyDistribution.cpp"
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp
index 2d8aa4b67ca6..da4d2812a91d 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/Common/EncodingUtils.h"
+#include "iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.h"
 #include "iree/compiler/Codegen/Common/GPU/Passes.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
@@ -36,118 +37,6 @@ namespace mlir::iree_compiler {
 #define GEN_PASS_DEF_GPUMATERIALIZEDEVICEENCODINGPASS
 #include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"
 
-// Returns the swizzle for a given intrinsic and operand index.
-// See the comment on MaterializeEncodingInfo::Swizzle for what that means.
-// This function is concerned with a single intrinsic, not a whole kernel tile.
-// TODO(bjacob): derive this automatically from the intrinsic layout getters.
-static MaterializeEncodingInfo::Swizzle
-getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic mma, int operandIdx) {
-  switch (mma) {
-  case IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x4_F32:
-    if (operandIdx == 2) {
-      return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{4, 4}, {16}},
-                                              /*permutation=*/{0, 2, 1}};
-    } else {
-      return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{16}, {4}},
-                                              /*permutation=*/{1, 0}};
-    }
-  case IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x16_F16:
-    if (operandIdx == 2) {
-      return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{4, 4}, {16}},
-                                              /*permutation=*/{0, 2, 1}};
-    } else {
-      return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{16}, {4, 4}},
-                                              /*permutation=*/{1, 0, 2}};
-    }
-  case IREE::GPU::MMAIntrinsic::MFMA_I32_16x16x32_I8:
-    if (operandIdx == 2) {
-      return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{4, 4}, {16}},
-                                              /*permutation=*/{0, 2, 1}};
-    } else {
-      return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{16}, {4, 8}},
-                                              /*permutation=*/{1, 0, 2}};
-    }
-  default:
-    assert(false && "should not get here.");
-    return {};
-  }
-}
-
-// Given an `expandShape` vector-of-vectors describing the mapping from source
-// dimensions to expanded dimensions, returns the index of the first expanded
-// dimension corresponding to the given source dimension index.
-static int64_t
-getExpandedDimFirstIdx(const SmallVector<SmallVector<int64_t>> &expandShape,
-                       int64_t srcIndex) {
-  int dstIndexFirst = 0;
-  for (int i = 0; i < srcIndex; ++i) {
-    dstIndexFirst += expandShape[i].size();
-  }
-  return dstIndexFirst;
-}
-
-// Unroll the dimension given by `srcIndex` by the given `unrollFactor`.
-// This is not interleaving layouts. The layout will consist of multiple copies
-// of the input tile, side by side.
-//
-// Example:
-//    Input swizzle = { expandShape = [[16], [4]], permutation = [1, 0] }
-//    Input srcIndex = 1
-//    Input unrollFactor = 4
-// -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] }
-//
-static void unroll(MaterializeEncodingInfo::Swizzle &swizzle, int srcIndex,
-                   int unrollFactor) {
-  assert(unrollFactor > 1);
-  int dstIndexFirst = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex);
-
-  // The new unrolling dimension is inserted at the start of the expandShape
-  // dimensions group corresponding to srcIndex.
-  swizzle.expandShape[srcIndex].insert(swizzle.expandShape[srcIndex].begin(),
-                                       unrollFactor);
-  // Since we are not interleaving here, generating side-by-side copies of the
-  // original layout, the new unrolling dimension is the new outermost
-  // dimension. Existing entries get shifted to make room for it.
-  for (auto &p : swizzle.permutation) {
-    p += (p >= dstIndexFirst);
-  }
-  swizzle.permutation.insert(swizzle.permutation.begin(), dstIndexFirst);
-}
-
-// Interleave the layout in `swizzle` by mutating `swizzle.permutation` to
-// move permutation[0], the outer-most dimension (which the unroll() function
-// created to be the unrolling dimension), to the inner dimension given by
-// `expandedDimIndexToInterleaveAt`.
-//
-// Example:
-//    Input swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] }
-//    Input srcIndex = 1
-//    Input expandedDimIndexToInterleaveAt = 1
-// -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [2, 0, 1] }
-//
-static void interleave(MaterializeEncodingInfo::Swizzle &swizzle, int srcIndex,
-                       int expandedDimIndexToInterleaveAt) {
-  // Compute which inner dimension to permute the current outer dimension into.
-  int dstIndexFirst = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex);
-  int dstIndexToInterleaveAt = dstIndexFirst + expandedDimIndexToInterleaveAt;
-
-  SmallVector<int64_t> outPermutation(swizzle.permutation.size());
-  // The leading dimension, permutation[0], gets moved inwards to the
-  // position that we just computed, dstIndexToInterleaveAt.
-  outPermutation[dstIndexToInterleaveAt] = swizzle.permutation[0];
-  // Outer dimensions get shifted outwards to fill the gap.
-  for (int i = 0; i < dstIndexToInterleaveAt; ++i) {
-    outPermutation[i] = swizzle.permutation[i + 1];
-  }
-  // Inner dimensions don't change. That is to say that we only interleave
-  // at `targetInterleavedElements` granularity, we don't swizzle further
-  // internally to that.
-  for (int i = dstIndexToInterleaveAt + 1; i < outPermutation.size(); ++i) {
-    outPermutation[i] = swizzle.permutation[i];
-  }
-  swizzle.permutation = outPermutation;
-}
-
 // Returns the index of the dimension whose flattened size (flattening inner
 // dimensions into it) matches the given `targetSize`. This is used to compute
 // interleaving indices.
@@ -174,16 +63,16 @@ static int64_t getDimIdxForTargetSize(const SmallVector<int64_t> &shape,
 
 // Generates the swizzle for the full data-tiled-mma tile, including all the
 // relevant unrolling factors.
-static MaterializeEncodingInfo::Swizzle
-getSwizzle(IREE::GPU::DataTiledMMAAttr mma, int operandIdx) {
+static TileSwizzle getSwizzle(IREE::GPU::DataTiledMMAAttr mma,
+                              IREE::GPU::MMAFragment fragment) {
   auto [AType, BType, CType] = mma.getABCElementTypes();
   int ABits = AType.getIntOrFloatBitWidth();
   int BBits = BType.getIntOrFloatBitWidth();
   // TODO(bjacob): Should be looked up from GPU target, instead of hard-coded.
   const int targetPreferredLoadBitWidth = 128;
-  auto swizzle = getIntrinsicSwizzle(mma.getIntrinsic().getValue(), operandIdx);
-  switch (operandIdx) {
-  case 0:
+  auto swizzle = getIntrinsicSwizzle(mma.getIntrinsic().getValue(), fragment);
+  switch (fragment) {
+  case IREE::GPU::MMAFragment::Lhs:
     // A-matrix (LHS). Source dimensions are M (index 0) and K (index 1).
     // Unroll on K with interleaving, then on M.
     if (mma.getUnrollK() > 1) {
@@ -197,7 +86,7 @@ getSwizzle(IREE::GPU::DataTiledMMAAttr mma, int operandIdx) {
       unroll(swizzle, 0, mma.getUnrollM());
     }
     break;
-  case 1:
+  case IREE::GPU::MMAFragment::Rhs:
     // B-matrix (RHS). Since the pack ops already took care of transposing B,
     // source dimensions are N (index 0) and K (index 1).
     // Unroll on K with interleaving, then on N.
@@ -212,7 +101,7 @@ getSwizzle(IREE::GPU::DataTiledMMAAttr mma, int operandIdx) {
       unroll(swizzle, 0, mma.getUnrollN());
     }
     break;
-  case 2:
+  case IREE::GPU::MMAFragment::Acc:
     // C-matrix (accumulator). Source dimensions are M (index 0) and N (index
     // 1). Unroll on N, then on M.
     if (mma.getUnrollN() > 1) {
@@ -310,8 +199,9 @@ materializeEncodingForTarget(RankedTensorType tensorType,
   TileMxNxK innerTile;
   std::tie(innerTile.M, innerTile.N, innerTile.K) = mma->getMNKShape();
   auto encodingInfo = getEncodingInfoForMatmul(encoding, rank, innerTile);
-  auto operandIdx = encoding.getOperandIndex().getInt();
-  encodingInfo.swizzle = getSwizzle(*mma, operandIdx);
+  auto fragment =
+      static_cast<IREE::GPU::MMAFragment>(encoding.getOperandIndex().getInt());
+  encodingInfo.swizzle = getSwizzle(*mma, fragment);
   return encodingInfo;
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.cpp
new file mode 100644
index 000000000000..b225e691fcea
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.cpp
@@ -0,0 +1,137 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
+
+namespace mlir::iree_compiler {
+
+// Returns the index, among all expanded dimensions, of the first expanded
+// dimension of source dimension `srcIndex`, where `expandShape[i]` lists the
+// expanded sizes of source dimension `i`. `srcIndex` may be one past the end.
+static int64_t
+getExpandedDimFirstIdx(const SmallVector<SmallVector<int64_t>> &expandShape,
+                       int64_t srcIndex) {
+  int64_t dstIndexFirst = 0;
+  for (int64_t i = 0; i < srcIndex; ++i) {
+    dstIndexFirst += expandShape[i].size();
+  }
+  return dstIndexFirst;
+}
+
+void unroll(TileSwizzle &swizzle, int srcIndex, int unrollFactor) {
+  assert(unrollFactor > 1);
+  // The new unrolling dimension becomes the first expanded dimension of the
+  // expandShape group that corresponds to source dimension `srcIndex`.
+  int newDimIdx = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex);
+  auto &group = swizzle.expandShape[srcIndex];
+  group.insert(group.begin(), unrollFactor);
+  // Permutation entries referring to expanded dimensions at or after the
+  // insertion point now refer to dimensions that moved by one.
+  for (int64_t &dimIdx : swizzle.permutation) {
+    dimIdx += (dimIdx >= newDimIdx) ? 1 : 0;
+  }
+  // No interleaving here: the copies of the original layout sit side by
+  // side, so the new unrolling dimension is the new outermost dimension.
+  swizzle.permutation.insert(swizzle.permutation.begin(), newDimIdx);
+}
+
+void interleave(TileSwizzle &swizzle, int srcIndex,
+                int expandedDimIndexToInterleaveAt) {
+  // Compute which inner position of the permutation the current outermost
+  // dimension moves into: the first expanded dimension of source dimension
+  // `srcIndex`, offset by `expandedDimIndexToInterleaveAt`.
+  int dstIndexFirst = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex);
+  int dstIndexToInterleaveAt = dstIndexFirst + expandedDimIndexToInterleaveAt;
+  SmallVector<int64_t> &permutation = swizzle.permutation;
+  assert(dstIndexToInterleaveAt >= 0 &&
+         dstIndexToInterleaveAt < static_cast<int>(permutation.size()) &&
+         "interleaving position must be a valid permutation index");
+  // Update the permutation in place, which avoids both a temporary vector and
+  // the signed/unsigned comparison that a size-bounded index loop would need.
+  // The leading dimension, permutation[0], is moved inwards to position
+  // `dstIndexToInterleaveAt`; entries outer to that position shift outwards by
+  // one to fill the gap; entries inner to it are left unchanged.
+  const int64_t leadingDim = permutation[0];
+  for (int i = 0; i < dstIndexToInterleaveAt; ++i) {
+    permutation[i] = permutation[i + 1];
+  }
+  permutation[dstIndexToInterleaveAt] = leadingDim;
+}
+
+// Returns the permutation of indices that sorts `v` with the given comparator.
+// Ties are broken by original index, so that the result is deterministic even
+// though std::sort is not a stable sort.
+template <template <typename U> class Comparator, typename T>
+static SmallVector<int64_t> getSortingPermutation(ArrayRef<T> v) {
+  SmallVector<int64_t> indices;
+  indices.reserve(v.size());
+  for (int64_t i = 0, e = v.size(); i < e; ++i) {
+    indices.push_back(i);
+  }
+  std::sort(indices.begin(), indices.end(), [&](int64_t i1, int64_t i2) {
+    Comparator<T> ordered;
+    // Compare indices only when neither element precedes the other.
+    return ordered(v[i1], v[i2]) || (!ordered(v[i2], v[i1]) && i1 < i2);
+  });
+  return indices;
+}
+
+TileSwizzle getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic intrinsic,
+                                IREE::GPU::MMAFragment fragment) {
+  auto layout = IREE::GPU::getSingleSubgroupLayout(intrinsic, fragment);
+
+  // The RHS is non-transposed in MMASingleSubgroupLayout but transposed in
+  // TileSwizzle, so exchange the two dimensions of every layout field.
+  if (fragment == IREE::GPU::MMAFragment::Rhs) {
+    std::swap(layout.outer[0], layout.outer[1]);
+    std::swap(layout.thread[0], layout.thread[1]);
+    std::swap(layout.tstrides[0], layout.tstrides[1]);
+    std::swap(layout.element[0], layout.element[1]);
+  }
+
+  // Step 1: start from the thread sizes alone, one expanded dimension per
+  // source dimension, i.e. no shape expansion for now.
+  TileSwizzle swizzle;
+  for (auto threadSize : layout.thread) {
+    swizzle.expandShape.push_back({threadSize});
+  }
+  // Step 2: the thread strides decide the initial permutation: dimensions
+  // are ordered by decreasing stride. A zero stride (as found in some WMMA
+  // intrinsics) would defeat this algorithm, so assert against it. That
+  // will need solving if and when data tiling is wanted on WMMA intrinsics.
+  for (auto stride : layout.tstrides) {
+    (void)stride;
+    assert(stride != 0);
+  }
+  swizzle.permutation =
+      getSortingPermutation<std::greater, int64_t>(layout.tstrides);
+  // Step 3: any element size other than 1 becomes a new innermost
+  // dimension. This mirrors unroll(), except that the created dimension is
+  // innermost rather than outermost.
+  for (int i = 0, numDims = layout.element.size(); i < numDims; ++i) {
+    if (layout.element[i] == 1) {
+      continue;
+    }
+    swizzle.expandShape[i].push_back(layout.element[i]);
+    int newIndex = getExpandedDimFirstIdx(swizzle.expandShape, i + 1) - 1;
+    for (auto &p : swizzle.permutation) {
+      p += (p >= newIndex);
+    }
+    swizzle.permutation.push_back(newIndex);
+  }
+  // Step 4: any outer size other than 1 is a plain unroll(). Visit dims in
+  // reverse order because each unroll() creates a new outermost dimension.
+  for (int i = layout.outer.size() - 1; i >= 0; --i) {
+    if (layout.outer[i] != 1) {
+      unroll(swizzle, i, layout.outer[i]);
+    }
+  }
+
+  return swizzle;
+}
+
+} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.h b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.h
new file mode 100644
index 000000000000..fc5af79c9485
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.h
@@ -0,0 +1,48 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_GPU_GPUTILESWIZZLEUTILS_H_
+#define IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_GPU_GPUTILESWIZZLEUTILS_H_
+
+#include "iree/compiler/Codegen/Common/TileSwizzle.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
+
+namespace mlir::iree_compiler {
+
+// Returns the TileSwizzle bringing a tile from row-major layout into the tiled
+// layout consumed by the given `intrinsic` and `fragment`.
+TileSwizzle getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic intrinsic,
+                                IREE::GPU::MMAFragment fragment);
+
+// Unrolls the dimension given by `srcIndex` by the given `unrollFactor`, which
+// must be greater than 1. This is not interleaving layouts: the layout will
+// consist of multiple copies of the input tile, side by side.
+//
+// Example:
+//    Input swizzle = { expandShape = [[16], [4]], permutation = [1, 0] }
+//    Input srcIndex = 1
+//    Input unrollFactor = 4
+// -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] }
+//
+void unroll(TileSwizzle &swizzle, int srcIndex, int unrollFactor);
+
+// Interleaves the layout in `swizzle` by moving permutation[0], the outer-most
+// dimension (which the unroll() function created to be the unrolling
+// dimension), inwards to the position `expandedDimIndexToInterleaveAt`,
+// counted from the first expanded dimension of source dimension `srcIndex`.
+//
+// Example:
+//    Input swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] }
+//    Input srcIndex = 1
+//    Input expandedDimIndexToInterleaveAt = 1
+// -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [2, 0, 1] }
+//
+void interleave(TileSwizzle &swizzle, int srcIndex,
+                int expandedDimIndexToInterleaveAt);
+
+} // namespace mlir::iree_compiler
+
+#endif // IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_GPU_GPUTILESWIZZLEUTILS_H_
diff --git a/compiler/src/iree/compiler/Codegen/Common/TileSwizzle.h b/compiler/src/iree/compiler/Codegen/Common/TileSwizzle.h
new file mode 100644
index 000000000000..b908ae43fac3
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/TileSwizzle.h
@@ -0,0 +1,35 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_TILESWIZZLE_H_
+#define IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_TILESWIZZLE_H_
+
+#include <cstdint>
+#include "llvm/ADT/SmallVector.h"
+
+namespace mlir::iree_compiler {
+
+// Metadata for a swizzle, that is, an (expand_shape -> transposition) pair of
+// ops performing a change of layout within the tiles. This is used on GPU,
+// where the tiles themselves can have an arbitrary layout.
+struct TileSwizzle {
+  // One entry per dimension of the original tile, listing the sizes of the
+  // internal dimensions that it gets expanded into. This is all the
+  // information needed to generate the `tensor.expand_shape`. For example,
+  // expandShape = [[16], [4, 2]] means that the original tile shape [16, 8]
+  // gets expanded such that the first dimension 16 is left unchanged, and the
+  // second dimension 8 gets split into two internal dims of size 4 and 2.
+  llvm::SmallVector<llvm::SmallVector<int64_t>> expandShape;
+  // Permutation of the expanded dimensions (indexed in the flattened order of
+  // `expandShape`), used to generate the `linalg.transpose` changing the
+  // layout of the tile. For example, permutation[0] dictates which of the
+  // expanded dimensions becomes the leading dimension of the layout.
+  llvm::SmallVector<int64_t> permutation;
+};
+
+} // namespace mlir::iree_compiler
+
+#endif // IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_TILESWIZZLE_H_
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
index 11e8a3d39826..7b7afd1670a9 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
@@ -550,103 +550,111 @@ int64_t MMAAttr::getSubgroupSize() const {
   return getIntrinsicSubgroupSize(getIntrinsic().getValue());
 }
 
-MMAAttr::SingleSubgroupLayout MMAAttr::getASingleSubgroupLayout() const {
-  switch (getIntrinsic().getValue()) {
-  case MMAIntrinsic::MFMA_F32_16x16x4_F32: {
-    return {/*outer=*/{1, 1}, /*thread=*/{16, 4}, /*strides=*/{1, 16},
-            /*element=*/{1, 1}};
-  }
+MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic,
+                                                MMAFragment fragment) {
+  switch (intrinsic) {
+  case MMAIntrinsic::MFMA_F32_16x16x4_F32:
+    switch (fragment) {
+    case MMAFragment::Lhs:
+      return {/*outer=*/{1, 1}, /*thread=*/{16, 4}, /*tstrides=*/{1, 16},
+              /*element=*/{1, 1}};
+    case MMAFragment::Rhs:
+      return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*tstrides=*/{16, 1},
+              /*element=*/{1, 1}};
+    case MMAFragment::Acc:
+      return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*tstrides=*/{16, 1},
+              /*element=*/{4, 1}};
+    }
   case MMAIntrinsic::MFMA_I32_16x16x16_I8:
-  case MMAIntrinsic::MFMA_F32_16x16x16_F16: {
-    return {/*outer=*/{1, 1}, /*thread=*/{16, 4}, /*strides=*/{1, 16},
-            /*element=*/{1, 4}};
-  }
+  case MMAIntrinsic::MFMA_F32_16x16x16_F16:
+    switch (fragment) {
+    case MMAFragment::Lhs:
+      return {/*outer=*/{1, 1}, /*thread=*/{16, 4}, /*tstrides=*/{1, 16},
+              /*element=*/{1, 4}};
+    case MMAFragment::Rhs:
+      return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*tstrides=*/{16, 1},
+              /*element=*/{4, 1}};
+    case MMAFragment::Acc:
+      return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*tstrides=*/{16, 1},
+              /*element=*/{4, 1}};
+    }
   case MMAIntrinsic::MFMA_I32_32x32x8_I8:
-  case MMAIntrinsic::MFMA_F32_32x32x8_F16: {
-    return {/*outer=*/{1, 1}, /*thread=*/{32, 2}, /*strides=*/{1, 32},
-            /*element=*/{1, 4}};
-  }
+  case MMAIntrinsic::MFMA_F32_32x32x8_F16:
+    switch (fragment) {
+    case MMAFragment::Lhs:
+      return {/*outer=*/{1, 1}, /*thread=*/{32, 2}, /*tstrides=*/{1, 32},
+              /*element=*/{1, 4}};
+    case MMAFragment::Rhs:
+      return {/*outer=*/{1, 1}, /*thread=*/{2, 32}, /*tstrides=*/{32, 1},
+              /*element=*/{4, 1}};
+    case MMAFragment::Acc:
+      return {/*outer=*/{4, 1}, /*thread=*/{2, 32}, /*tstrides=*/{32, 1},
+              /*element=*/{4, 1}};
+    }
   case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ:
-  case MMAIntrinsic::MFMA_I32_16x16x32_I8: {
-    return {/*outer=*/{1, 1}, /*thread=*/{16, 4}, /*strides=*/{1, 16},
-            /*element=*/{1, 8}};
-  }
-  case MMAIntrinsic::MFMA_I32_32x32x16_I8: {
-    return {/*outer=*/{1, 1}, /*thread=*/{32, 2}, /*strides=*/{1, 32},
-            /*element=*/{1, 8}};
-  }
+  case MMAIntrinsic::MFMA_I32_16x16x32_I8:
+    switch (fragment) {
+    case MMAFragment::Lhs:
+      return {/*outer=*/{1, 1}, /*thread=*/{16, 4}, /*tstrides=*/{1, 16},
+              /*element=*/{1, 8}};
+    case MMAFragment::Rhs:
+      return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*tstrides=*/{16, 1},
+              /*element=*/{8, 1}};
+    case MMAFragment::Acc:
+      return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*tstrides=*/{16, 1},
+              /*element=*/{4, 1}};
+    }
+  case MMAIntrinsic::MFMA_I32_32x32x16_I8:
+    switch (fragment) {
+    case MMAFragment::Lhs:
+      return {/*outer=*/{1, 1}, /*thread=*/{32, 2}, /*tstrides=*/{1, 32},
+              /*element=*/{1, 8}};
+    case MMAFragment::Rhs:
+      return {/*outer=*/{1, 1}, /*thread=*/{2, 32}, /*tstrides=*/{32, 1},
+              /*element=*/{8, 1}};
+    case MMAFragment::Acc:
+      return {/*outer=*/{4, 1}, /*thread=*/{2, 32}, /*tstrides=*/{32, 1},
+              /*element=*/{4, 1}};
+    }
   case MMAIntrinsic::WMMA_F32_16x16x16_F16:
+  case MMAIntrinsic::WMMA_I32_16x16x16_I8:
+    switch (fragment) {
+    case MMAFragment::Lhs:
+      return {/*outer=*/{1, 1}, /*thread=*/{16, 1}, /*tstrides=*/{1, 0},
+              /*element=*/{1, 16}};
+    case MMAFragment::Rhs:
+      return {/*outer=*/{1, 1}, /*thread=*/{1, 16}, /*tstrides=*/{0, 1},
+              /*element=*/{16, 1}};
+    case MMAFragment::Acc:
+      return {/*outer=*/{8, 1}, /*thread=*/{2, 16}, /*tstrides=*/{16, 1},
+              /*element=*/{1, 1}};
+    }
   case MMAIntrinsic::WMMA_F16_16x16x16_F16:
-  case MMAIntrinsic::WMMA_I32_16x16x16_I8: {
-    return {/*outer=*/{1, 1}, /*thread=*/{16, 1}, /*strides=*/{1, 0},
-            /*element=*/{1, 16}};
-  }
+    switch (fragment) {
+    case MMAFragment::Lhs:
+      return {/*outer=*/{1, 1}, /*thread=*/{16, 1}, /*tstrides=*/{1, 0},
+              /*element=*/{1, 16}};
+    case MMAFragment::Rhs:
+      return {/*outer=*/{1, 1}, /*thread=*/{1, 16}, /*tstrides=*/{0, 1},
+              /*element=*/{16, 1}};
+    case MMAFragment::Acc:
+      return {/*outer=*/{16, 1}, /*thread=*/{1, 16}, /*tstrides=*/{0, 1},
+              /*element=*/{1, 1}};
+    }
   }
   return {};
 }
 
-MMAAttr::SingleSubgroupLayout MMAAttr::getBSingleSubgroupLayout() const {
-  switch (getIntrinsic().getValue()) {
-  case MMAIntrinsic::MFMA_F32_16x16x4_F32: {
-    return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*strides=*/{16, 1},
-            /*element=*/{1, 1}};
-  }
-  case MMAIntrinsic::MFMA_I32_16x16x16_I8:
-  case MMAIntrinsic::MFMA_F32_16x16x16_F16: {
-    return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*strides=*/{16, 1},
-            /*element=*/{4, 1}};
-  }
-  case MMAIntrinsic::MFMA_I32_32x32x8_I8:
-  case MMAIntrinsic::MFMA_F32_32x32x8_F16: {
-    return {/*outer=*/{1, 1}, /*thread=*/{2, 32}, /*strides=*/{32, 1},
-            /*element=*/{4, 1}};
-  }
-  case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ:
-  case MMAIntrinsic::MFMA_I32_16x16x32_I8: {
-    return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*strides=*/{16, 1},
-            /*element=*/{8, 1}};
-  }
-  case MMAIntrinsic::MFMA_I32_32x32x16_I8: {
-    return {/*outer=*/{1, 1}, /*thread=*/{2, 32}, /*strides=*/{32, 1},
-            /*element=*/{8, 1}};
-  }
-  case MMAIntrinsic::WMMA_F32_16x16x16_F16:
-  case MMAIntrinsic::WMMA_F16_16x16x16_F16:
-  case MMAIntrinsic::WMMA_I32_16x16x16_I8: {
-    return {/*outer=*/{1, 1}, /*thread=*/{1, 16}, /*strides=*/{0, 1},
-            /*element=*/{16, 1}};
-  }
-  }
-  return {};
+MMASingleSubgroupLayout MMAAttr::getASingleSubgroupLayout() const {
+  return getSingleSubgroupLayout(getIntrinsic().getValue(), MMAFragment::Lhs);
 }
 
-MMAAttr::SingleSubgroupLayout MMAAttr::getCSingleSubgroupLayout() const {
-  switch (getIntrinsic().getValue()) {
-  case MMAIntrinsic::MFMA_F32_16x16x4_F32:
-  case MMAIntrinsic::MFMA_I32_16x16x16_I8:
-  case MMAIntrinsic::MFMA_F32_16x16x16_F16:
-  case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ:
-  case MMAIntrinsic::MFMA_I32_16x16x32_I8: {
-    return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*strides=*/{16, 1},
-            /*element=*/{4, 1}};
-  }
-  case MMAIntrinsic::MFMA_I32_32x32x8_I8:
-  case MMAIntrinsic::MFMA_F32_32x32x8_F16:
-  case MMAIntrinsic::MFMA_I32_32x32x16_I8: {
-    return {/*outer=*/{4, 1}, /*thread=*/{2, 32}, /*strides=*/{32, 1},
-            /*element=*/{4, 1}};
-  }
-  case MMAIntrinsic::WMMA_F32_16x16x16_F16:
-  case MMAIntrinsic::WMMA_I32_16x16x16_I8: {
-    return {/*outer=*/{8, 1}, /*thread=*/{2, 16}, /*strides=*/{16, 1},
-            /*element=*/{1, 1}};
-  }
-  case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
-    return {/*outer=*/{16, 1}, /*thread=*/{1, 16}, /*strides=*/{0, 1},
-            /*element=*/{1, 1}};
-  }
-  }
-  return {};
+MMASingleSubgroupLayout MMAAttr::getBSingleSubgroupLayout() const {
+  return getSingleSubgroupLayout(getIntrinsic().getValue(), MMAFragment::Rhs);
+}
+
+MMASingleSubgroupLayout MMAAttr::getCSingleSubgroupLayout() const {
+  return getSingleSubgroupLayout(getIntrinsic().getValue(), MMAFragment::Acc);
 }
 
 // Generates amdgpu.mfma/wmma operation on the given inputs for this attribute
@@ -701,11 +709,12 @@ FailureOr<Value> MMAAttr::buildMmaOperation(OpBuilder &builder, Location loc,
 
 static LogicalResult populateCanonicalOffsetsSizesAndStrides(
     OpBuilder &builder, Location loc, Value laneId,
-    ArrayRef<int64_t> permutation, MMAAttr::SingleSubgroupLayout subgroupLayout,
+    ArrayRef<int64_t> permutation, MMASingleSubgroupLayout subgroupLayout,
     SmallVector<OpFoldResult> &canonicalOffsets,
     SmallVector<OpFoldResult> &canonicalSizes,
     SmallVector<OpFoldResult> &canonicalStrides) {
   SmallVector<int64_t> rankReducedShape;
+
   for (auto [outer, thread, element] :
        llvm::zip_equal(subgroupLayout.outer, subgroupLayout.thread,
                        subgroupLayout.element)) {
@@ -767,7 +776,7 @@ LogicalResult MMAAttr::populateOperandOffsetsSizesStrides(
     SmallVector<OpFoldResult> &offsets, SmallVector<OpFoldResult> &sizes,
     SmallVector<OpFoldResult> &strides) const {
 
-  MMAAttr::SingleSubgroupLayout subgroupLayout;
+  MMASingleSubgroupLayout subgroupLayout;
   switch (fragment) {
   case IREE::GPU::MMAFragment::Lhs: {
     subgroupLayout = getASingleSubgroupLayout();
@@ -984,7 +993,7 @@ NestedLayoutAttr createNestedLayout(MLIRContext *context, int64_t rank,
                                     SmallVector<int64_t> subgroupSizes,
                                     SmallVector<int64_t> subgroupStrides,
                                     SmallVector<int64_t> batchCount,
-                                    MMAAttr::SingleSubgroupLayout counts) {
+                                    MMASingleSubgroupLayout counts) {
 
   LLVM_DEBUG({
     llvm::errs() << "Creating Nested Layout for::";
@@ -1052,7 +1061,7 @@ MMAScheduleAttr::getContractionLayout(VectorContractOpInfo &opInfo,
   }
 
   // Get the concrete nested layout for each matrix. Note that the struct
-  // MMAAttr::SingleSubgroupLayout contains the partial layout for the
+  // MMASingleSubgroupLayout contains the partial layout for the
   // canonical (M, K) x (K, N) -> (M, N) matmul form; while the specific
   // contract op we are looking at right now may not be exactly in that form.
   // So here we need to permute/transpose the canonical layout to match with
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h
index eb1b24ef9552..720c1bc088a1 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h
@@ -20,6 +20,22 @@
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypes.h"
 
+namespace mlir::iree_compiler::IREE::GPU {
+
+// Partial nested layout for an MMA intrinsic's matrix input/output inside
+// a single subgroup.
+struct MMASingleSubgroupLayout {
+  llvm::SmallVector<int64_t, 2> outer;    // Outer-level 2-D shape.
+  llvm::SmallVector<int64_t, 2> thread;   // Thread-level 2-D shape.
+  llvm::SmallVector<int64_t, 2> tstrides; // Presumably thread strides (verify).
+  llvm::SmallVector<int64_t, 2> element;  // Element-level 2-D shape.
+};
+
+MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic,
+                                                MMAFragment fragment);
+
+} // namespace mlir::iree_compiler::IREE::GPU
+
 // clang-format off
 #define GET_ATTRDEF_CLASSES
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h.inc"
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
index 56ae368b14c6..1e25185e00f0 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
@@ -201,22 +201,13 @@ def IREEGPU_MMAAttr : IREEGPU_MmaVectorLayoutAttr<"MMA", "MMAIntrinsicAttr"> {
   let extraClassDeclaration = [{
     int64_t getBlockSize() const;
 
-    // Partial nested layout for an MMA intrinsic's matrix input/output inside
-    // a single subgroup.
-    struct SingleSubgroupLayout {
-      SmallVector<int64_t, 2> outer;
-      SmallVector<int64_t, 2> thread;
-      SmallVector<int64_t, 2> tstrides;
-      SmallVector<int64_t, 2> element;
-    };
-
     // Returns the A/B/C matrix's partial nested layout shape inside a single
     // subgroup. Shape at each outer/thread/element level is a 2-D value,
     // following canonical matmul order--(M, K) for A, (K, N) for B, and
     // (M, N) for C.
-    SingleSubgroupLayout getASingleSubgroupLayout() const;
-    SingleSubgroupLayout getBSingleSubgroupLayout() const;
-    SingleSubgroupLayout getCSingleSubgroupLayout() const;
+    MMASingleSubgroupLayout getASingleSubgroupLayout() const;
+    MMASingleSubgroupLayout getBSingleSubgroupLayout() const;
+    MMASingleSubgroupLayout getCSingleSubgroupLayout() const;
   }];
 }