diff --git a/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel index 38f11a4881e7..186cffbb02f0 100644 --- a/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel @@ -153,6 +153,7 @@ iree_compiler_cc_library( "PassUtils.h", "Passes.h", "TileSizeSelection.h", + "TileSwizzle.h", "Transforms.h", "UserConfig.h", ], diff --git a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt index 7d35eb25bd64..4f3bf1e2afed 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt @@ -73,6 +73,7 @@ iree_cc_library( "PassUtils.h" "Passes.h" "TileSizeSelection.h" + "TileSwizzle.h" "Transforms.h" "UserConfig.h" SRCS diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h index d12a44fe8626..910826c7445b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h +++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h @@ -7,6 +7,7 @@ #ifndef IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_ #define IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_ +#include "iree/compiler/Codegen/Common/TileSwizzle.h" #include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h" #include "iree/compiler/Dialect/HAL/IR/HALTypes.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" @@ -15,32 +16,7 @@ namespace mlir::iree_compiler { /// Container of information needed to materialize the layout transformations. -/// -/// On CPU, these layout transformations consist of a single `temsor.pack` -/// or `tensor.unpack` op, implementing a tiled layout where each tile is -/// row-major. -/// -/// On GPU, there is an additional `swizzle`, which changes the layout inside -/// of the tile. See the comment on the nested Swizzle struct. struct MaterializeEncodingInfo { - // Metadata for a swizzle, that is, an (expand_shape -> transposition) - // pair of ops performing a change of layout within the tiles. This is used - // on GPU, where the tiles themselves can have an arbitrary layout. - struct Swizzle { - // This vector-of-vectors contains all the information needed to generate - // a `tensor.expand_shape` creating additional internal dimensions into the - // tile. For example, expandShape = [[16], [4, 2]] means that the original - // tile shape [16, 8] gets expanded such that the first dimension 16 is left - // unchanged, and the second dimension 8 gets split into two internal dims - // of size 4 and 2. - SmallVector> expandShape; - // This permutation vector applies to the expanded dimensions and is used - // to generate a `linalg.transpose` changing the layout of the tile. For - // example, permutation[0] dictates which of the expanded dimensions becomes - // the leading dimension of the layout. - SmallVector permutation; - }; - // The next 3 fields are used to create a `tensor.pack` or `tensor.unpack` op, // changing the overall layout between row-major and tiled (where each tile is // row-major). @@ -48,8 +24,8 @@ struct MaterializeEncodingInfo { SmallVector innerTileSizes; SmallVector outerDimsPerm; - // The optional swizzle, see the above comment on Swizzle. Only used on GPU. - std::optional swizzle; + // The optional swizzle, see the comment on TileSwizzle. Only used on GPU. + std::optional swizzle; }; using MaterializeEncodingFn = std::function( diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp index 785b24d3becc..3e2cb427ed98 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp @@ -21,9 +21,10 @@ namespace { using namespace mlir::iree_compiler::IREE::VectorExt; using VectorValue = TypedValue; -static LogicalResult isSubgroupLayoutCompatible( - IREE::GPU::MMAAttr::SingleSubgroupLayout subgroupLayout, - NestedLayoutAttr layout, int64_t dim1, int64_t dim2) { +static LogicalResult +isSubgroupLayoutCompatible(IREE::GPU::MMASingleSubgroupLayout subgroupLayout, + NestedLayoutAttr layout, int64_t dim1, + int64_t dim2) { SmallVector element = {layout.getElementTile()[dim1], layout.getElementTile()[dim2]}; SmallVector thread = {layout.getThreadTile()[dim1], diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel index fdfd56291084..8bdd9982aff2 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel @@ -75,6 +75,7 @@ iree_compiler_cc_library( "GPUTensorTileToSerialLoops.cpp", "GPUTile.cpp", "GPUTileReduction.cpp", + "GPUTileSwizzleUtils.cpp", "GPUVectorAlloc.cpp", "GPUVectorDistribution.cpp", "GPUVerifyDistribution.cpp", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt index d73a119fda3f..82387adad22c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt @@ -73,6 +73,7 @@ iree_cc_library( "GPUTensorTileToSerialLoops.cpp" "GPUTile.cpp" "GPUTileReduction.cpp" + "GPUTileSwizzleUtils.cpp" "GPUVectorAlloc.cpp" "GPUVectorDistribution.cpp" "GPUVerifyDistribution.cpp" diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp index 2d8aa4b67ca6..da4d2812a91d 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree/compiler/Codegen/Common/EncodingUtils.h" +#include "iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.h" #include "iree/compiler/Codegen/Common/GPU/Passes.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" @@ -36,118 +37,6 @@ namespace mlir::iree_compiler { #define GEN_PASS_DEF_GPUMATERIALIZEDEVICEENCODINGPASS #include "iree/compiler/Codegen/Common/GPU/Passes.h.inc" -// Returns the swizzle for a given intrinsic and operand index. -// See the comment on MaterializeEncodingInfo::Swizzle for what that means. -// This function is concerned with a single intrinsic, not a whole kernel tile. -// TODO(bjacob): derive this automatically from the intrinsic layout getters. -static MaterializeEncodingInfo::Swizzle -getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic mma, int operandIdx) { - switch (mma) { - case IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x4_F32: - if (operandIdx == 2) { - return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{4, 4}, {16}}, - /*permutation=*/{0, 2, 1}}; - } else { - return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{16}, {4}}, - /*permutation=*/{1, 0}}; - } - case IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x16_F16: - if (operandIdx == 2) { - return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{4, 4}, {16}}, - /*permutation=*/{0, 2, 1}}; - } else { - return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{16}, {4, 4}}, - /*permutation=*/{1, 0, 2}}; - } - case IREE::GPU::MMAIntrinsic::MFMA_I32_16x16x32_I8: - if (operandIdx == 2) { - return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{4, 4}, {16}}, - /*permutation=*/{0, 2, 1}}; - } else { - return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{16}, {4, 8}}, - /*permutation=*/{1, 0, 2}}; - } - default: - assert(false && "should not get here."); - return {}; - } -} - -// Given an `expandShape` vector-of-vectors describing the mapping from source -// dimensions to expanded dimensions, returns the index of the first expanded -// dimension corresponding to the given source dimension index. -static int64_t -getExpandedDimFirstIdx(const SmallVector> &expandShape, - int64_t srcIndex) { - int dstIndexFirst = 0; - for (int i = 0; i < srcIndex; ++i) { - dstIndexFirst += expandShape[i].size(); - } - return dstIndexFirst; -} - -// Unroll the dimension given by `srcIndex` by the given `unrollFactor`. -// This is not interleaving layouts. The layout will consist of multiple copies -// of the input tile, side by side. -// -// Example: -// Input swizzle = { expandShape = [[16], [4]], permutation = [1, 0] } -// Input srcIndex = 1 -// Input unrollFactor = 4 -// -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] } -// -static void unroll(MaterializeEncodingInfo::Swizzle &swizzle, int srcIndex, - int unrollFactor) { - assert(unrollFactor > 1); - int dstIndexFirst = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex); - - // The new unrolling dimension is inserted at the start of the expandShape - // dimensions group corresponding to srcIndex. - swizzle.expandShape[srcIndex].insert(swizzle.expandShape[srcIndex].begin(), - unrollFactor); - // Since we are not interleaving here, generating side-by-side copies of the - // original layout, the new unrolling dimension is the new outermost - // dimension. Existing entries get shifted to make room for it. - for (auto &p : swizzle.permutation) { - p += (p >= dstIndexFirst); - } - swizzle.permutation.insert(swizzle.permutation.begin(), dstIndexFirst); -} - -// Interleave the layout in `swizzle` by mutating `swizzle.permutation` to -// move permutation[0], the outer-most dimension (which the unroll() function -// created to be the unrolling dimension), to the inner dimension given by -// `expandedDimIndexToInterleaveAt`. -// -// Example: -// Input swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] } -// Input srcIndex = 1 -// Input expandedDimIndexToInterleaveAt = 1 -// -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [2, 0, 1] } -// -static void interleave(MaterializeEncodingInfo::Swizzle &swizzle, int srcIndex, - int expandedDimIndexToInterleaveAt) { - // Compute which inner dimension to permute the current outer dimension into. - int dstIndexFirst = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex); - int dstIndexToInterleaveAt = dstIndexFirst + expandedDimIndexToInterleaveAt; - - SmallVector outPermutation(swizzle.permutation.size()); - // The leading dimension, permutation[0], gets moved inwards to the - // position that we just computed, dstIndexToInterleaveAt. - outPermutation[dstIndexToInterleaveAt] = swizzle.permutation[0]; - // Outer dimensions get shifted outwards to fill the gap. - for (int i = 0; i < dstIndexToInterleaveAt; ++i) { - outPermutation[i] = swizzle.permutation[i + 1]; - } - // Inner dimensions don't change. That is to say that we only interleave - // at `targetInterleavedElements` granularity, we don't swizzle further - // internally to that. - for (int i = dstIndexToInterleaveAt + 1; i < outPermutation.size(); ++i) { - outPermutation[i] = swizzle.permutation[i]; - } - swizzle.permutation = outPermutation; -} - // Returns the index of the dimension whose flattened size (flattening inner // dimensions into it) matches the given `targetSize`. This is used to compute // interleaving indices. @@ -174,16 +63,16 @@ static int64_t getDimIdxForTargetSize(const SmallVector &shape, // Generates the swizzle for the full data-tiled-mma tile, including all the // relevant unrolling factors. -static MaterializeEncodingInfo::Swizzle -getSwizzle(IREE::GPU::DataTiledMMAAttr mma, int operandIdx) { +static TileSwizzle getSwizzle(IREE::GPU::DataTiledMMAAttr mma, + IREE::GPU::MMAFragment fragment) { auto [AType, BType, CType] = mma.getABCElementTypes(); int ABits = AType.getIntOrFloatBitWidth(); int BBits = BType.getIntOrFloatBitWidth(); // TODO(bjacob): Should be looked up from GPU target, instead of hard-coded. const int targetPreferredLoadBitWidth = 128; - auto swizzle = getIntrinsicSwizzle(mma.getIntrinsic().getValue(), operandIdx); - switch (operandIdx) { - case 0: + auto swizzle = getIntrinsicSwizzle(mma.getIntrinsic().getValue(), fragment); + switch (fragment) { + case IREE::GPU::MMAFragment::Lhs: // A-matrix (LHS). Source dimensions are M (index 0) and K (index 1). // Unroll on K with interleaving, then on M. if (mma.getUnrollK() > 1) { @@ -197,7 +86,7 @@ getSwizzle(IREE::GPU::DataTiledMMAAttr mma, int operandIdx) { unroll(swizzle, 0, mma.getUnrollM()); } break; - case 1: + case IREE::GPU::MMAFragment::Rhs: // B-matrix (RHS). Since the pack ops already took care of transposing B, // source dimensions are N (index 0) and K (index 1). // Unroll on K with interleaving, then on N. @@ -212,7 +101,7 @@ getSwizzle(IREE::GPU::DataTiledMMAAttr mma, int operandIdx) { unroll(swizzle, 0, mma.getUnrollN()); } break; - case 2: + case IREE::GPU::MMAFragment::Acc: // C-matrix (accumulator). Source dimensions are M (index 0) and N (index // 1). Unroll on N, then on M. if (mma.getUnrollN() > 1) { @@ -310,8 +199,9 @@ materializeEncodingForTarget(RankedTensorType tensorType, TileMxNxK innerTile; std::tie(innerTile.M, innerTile.N, innerTile.K) = mma->getMNKShape(); auto encodingInfo = getEncodingInfoForMatmul(encoding, rank, innerTile); - auto operandIdx = encoding.getOperandIndex().getInt(); - encodingInfo.swizzle = getSwizzle(*mma, operandIdx); + auto fragment = + static_cast(encoding.getOperandIndex().getInt()); + encodingInfo.swizzle = getSwizzle(*mma, fragment); return encodingInfo; } diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.cpp new file mode 100644 index 000000000000..b225e691fcea --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.cpp @@ -0,0 +1,137 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.h" +#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" + +namespace mlir::iree_compiler { + +// Given an `expandShape` vector-of-vectors describing the mapping from source +// dimensions to expanded dimensions, returns the index of the first expanded +// dimension corresponding to the given source dimension index. +static int64_t +getExpandedDimFirstIdx(const SmallVector> &expandShape, + int64_t srcIndex) { + int dstIndexFirst = 0; + for (int i = 0; i < srcIndex; ++i) { + dstIndexFirst += expandShape[i].size(); + } + return dstIndexFirst; +} + +void unroll(TileSwizzle &swizzle, int srcIndex, int unrollFactor) { + assert(unrollFactor > 1); + int dstIndexFirst = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex); + + // The new unrolling dimension is inserted at the start of the expandShape + // dimensions group corresponding to srcIndex. + swizzle.expandShape[srcIndex].insert(swizzle.expandShape[srcIndex].begin(), + unrollFactor); + // Since we are not interleaving here, generating side-by-side copies of the + // original layout, the new unrolling dimension is the new outermost + // dimension. Existing entries get shifted to make room for it. + for (auto &p : swizzle.permutation) { + p += (p >= dstIndexFirst); + } + swizzle.permutation.insert(swizzle.permutation.begin(), dstIndexFirst); +} + +void interleave(TileSwizzle &swizzle, int srcIndex, + int expandedDimIndexToInterleaveAt) { + // Compute which inner dimension to permute the current outer dimension into. + int dstIndexFirst = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex); + int dstIndexToInterleaveAt = dstIndexFirst + expandedDimIndexToInterleaveAt; + + SmallVector outPermutation(swizzle.permutation.size()); + // The leading dimension, permutation[0], gets moved inwards to the + // position that we just computed, dstIndexToInterleaveAt. + outPermutation[dstIndexToInterleaveAt] = swizzle.permutation[0]; + // Outer dimensions get shifted outwards to fill the gap. + for (int i = 0; i < dstIndexToInterleaveAt; ++i) { + outPermutation[i] = swizzle.permutation[i + 1]; + } + // Inner dimensions don't change. That is to say that we only interleave + // at `targetInterleavedElements` granularity, we don't swizzle further + // internally to that. + for (int i = dstIndexToInterleaveAt + 1; i < outPermutation.size(); ++i) { + outPermutation[i] = swizzle.permutation[i]; + } + swizzle.permutation = outPermutation; +} + +// Returns the permutation of indices that sorts `v` with the given comparator. +template