Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Experimental] Update deep tiled matmul for Parallelize Graphs on multiple numa nodes #153

Open
wants to merge 5 commits into
base: zhicong/run_pipeline_with_tuner
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion include/gc/Analysis/MatmulConfigAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,29 @@ struct SystemDesc {
// get runtime OMP_NUM_THREADS
uint32_t getNumThreads() {
char *numThreads = getenv("OMP_NUM_THREADS");
if (numThreads) {
if (!threads_limited && numThreads) {
return std::stoi(numThreads);
}
return curThreads;
}

// set the expected threads
void limitOnSingleNode(uint32_t numa_node) {
char *cacheSize = getenv("NUMA_THREADS");
if (cacheSize) {
curThreads = std::stoi(cacheSize);
threads_limited = true;
}
}

// Number of NUMA nodes the work is split across: total OMP threads divided
// by the per-node thread budget. Returns 1 when no per-node limit is active
// or when the environment does not define OMP_NUM_THREADS.
uint32_t getNumNodes() {
  char *numThreads = getenv("OMP_NUM_THREADS");
  // `curThreads > 0` guards the division below against a zero divisor.
  if (threads_limited && numThreads && curThreads > 0) {
    int totalThreads = std::stoi(numThreads);
    // Never report fewer than one node (e.g. OMP_NUM_THREADS < NUMA_THREADS).
    if (totalThreads > static_cast<int>(curThreads))
      return totalThreads / curThreads;
  }
  return 1;
}

// get cache size by cacheLevel
size_t getCacheSize(uint8_t cacheLevel) {
if (cacheLevel == 1) {
Expand All @@ -57,6 +75,10 @@ struct SystemDesc {
// Maximum vector lengths reported for contraction operations; both entries
// are 512 (presumably bits — confirm against the target ISA description).
SmallVector<size_t> getContractionOperationMaxVectorLength() {
  SmallVector<size_t> maxVectorLengths;
  maxVectorLengths.push_back(512UL);
  maxVectorLengths.push_back(512UL);
  return maxVectorLengths;
}

private:
uint32_t curThreads = 1;
bool threads_limited = false;
};

struct MatmulConfig {
Expand Down
6 changes: 6 additions & 0 deletions lib/gc/Analysis/MatmulConfigAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,12 @@ previous matmul
MatmulConfigAnalysis::MatmulConfigAnalysis(Operation *root) {
SystemDesc sysDesc;
if (auto linalgOp = dyn_cast<linalg::LinalgOp>(root)) {
// Check if the operation has an attribute named 'splited'
auto splitedAttr = linalgOp->getAttrOfType<IntegerAttr>("splited");
if (splitedAttr) {
sysDesc.limitOnSingleNode(splitedAttr.getInt());
llvm::outs() << "splited mm, and should be allocated on numa node 0.\n";
}
auto oprandDimType = *getOprandDimType(linalgOp);
// get the origin M,N,K size
auto MDimTypeIdx = extractDimTypeIdx(oprandDimType[0], DimType::M);
Expand Down
64 changes: 61 additions & 3 deletions lib/gc/Transforms/DeepTileContractionNamedOp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,23 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp,
else
tileSizes[d] = getAsIndexOpFoldResult(b.getContext(), tile);
}
llvm::outs() << "====================================\n";
llvm::outs() << "tileSize: ";
for (auto t : tileSizes) {
llvm::outs() << t << ", ";
}
llvm::outs() << "\n";
bool isLastForAllLoop = false;
for (auto [idx, tile] : llvm::enumerate(tileSizes)) {
if (isConstantIntValue(tile, 0)) {
break;
}
if (idx == tileSizes.size() - 1)
isLastForAllLoop = true;
}

llvm::outs() << "isLastForAllLoop: " << isLastForAllLoop << "\n";
llvm::outs() << "====================================\n";
SmallVector<Range> loopRanges =
cast<TilingInterface>(currentOp.getOperation()).getIterationDomain(b);
OpBuilder::InsertionGuard guard(b);
Expand All @@ -482,7 +499,6 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp,
tileSizes[idx] = loopRanges[idx].size;
}
}

SmallVector<OpFoldResult> newParallelDims;
for (auto i = 0UL; i < reductionDims.size(); i++) {
newParallelDims.push_back(getAsIndexOpFoldResult(b.getContext(), i));
Expand All @@ -503,6 +519,43 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp,
}
}
}
if (isLastForAllLoop) {
b.setInsertionPointAfter(currentOp);
mlir::easybuild::EasyBuilder eb{b, currentOp.getLoc()};
auto cond = eb(true);
auto forAllOp = tilingResult->loops;
auto ifOp = b.create<mlir::scf::IfOp>(currentOp.getLoc(), cond);
b.setInsertionPointToStart(&ifOp.getThenRegion().front());
b.setInsertionPointAfter(ifOp);
// auto loc = currentOp.getLoc();
// auto indexType = b.getIndexType();
// auto c1 = b.create<arith::ConstantIndexOp>(loc, 0);

// Get the argument to compare with
// Value arg2 = forAllOp.getRegion().getArgument(
// 0); // This assumes %arg2 is the first argument
// Value comparison =
// b.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq, arg2,
// c1);

// // Create the scf.if operation
// b.setInsertionPointToStart(&forAllOp.getRegion().front());
// auto ifOp = b.create<scf::IfOp>(loc, comparison,
// /*withElseRegion=*/false);

// // Move the body of forallOp into the if true region
// b.inlineRegionBefore(forAllOp.getRegion(), ifOp.getThenRegion(),
// ifOp.getThenRegion().begin());

// // Now the body of forallOp is in the ifOp, we should clean up the
// // original region.
// forAllOp.getRegion().dropAllReferences();
// // forAllOp.getRegion().clear();

// // Insert a yield operation to the scf.if operation's then region
// b.setInsertionPointToEnd(&ifOp.getThenRegion().front());
// b.create<scf::YieldOp>(loc);
}
} else if (auto tilingInterface =
cast<TilingInterface>(currentOp.getOperation())) {
auto tilingResult = linalg::tileToForallOpUsingTileSizes(
Expand Down Expand Up @@ -595,6 +648,11 @@ struct deepTileMatmul : public OpInterfaceRewritePattern<linalg::LinalgOp> {
auto NOuterBlockSize = NDimPos.size() > 1
? (cfg.NBlock - 1) / cfg.innerMostNBlock + 1
: cfg.NBlock;
// Outermost Numa loop
option.nestedTileSizes.emplace_back(
SmallVector<size_t>{uint32_t(MFirstDim / 2)});
option.loopType.emplace_back(OuterLoopGenerationOption::LoopType::ForallOp);
option.loopDim.emplace_back(SmallVector<size_t>{MDimPos[0]});
// Outer
option.nestedTileSizes.emplace_back(SmallVector<size_t>{
MParallelBlockSize, NParallelBlockSize, KParallelBlockSize});
Expand Down Expand Up @@ -906,8 +964,8 @@ struct deepTileMatmul : public OpInterfaceRewritePattern<linalg::LinalgOp> {
llvm::isa<linalgx::Mm2DVnniOp>(linalgOp) ||
llvm::isa<linalgx::Mm4DVnniOp>(linalgOp) ||
llvm::isa<linalgx::MultiBatchMatmulOp>(linalgOp) ||
llvm::isa<linalg::BatchMatmulOp>(linalgOp) ||
llvm::isa<linalg::GenericOp>(linalgOp);
llvm::isa<linalg::BatchMatmulOp>(linalgOp); // ||
// llvm::isa<linalg::GenericOp>(linalgOp);
}

LogicalResult matchAndRewrite(linalg::LinalgOp linalgOp,
Expand Down
75 changes: 65 additions & 10 deletions lib/gc/Transforms/Tiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -782,6 +782,22 @@ FailureOr<TiledLinalgOp> static tileLinalgOpImpl(
return tileLinalgOpImpl<LoopTy>(b, op, tileSizeVector, options);
}

// "Tiled implementation" used for the outermost NUMA-distribution loop:
// instead of extracting slices of the operands, clone the op at its full size
// into the loop body. The `offsets`/`sizes` computed by the tiling driver are
// deliberately ignored — each iteration receives the whole operand shapes.
// NOTE(review): confirm callers rely on this full-shape cloning; the previous
// comment about `sizeBounds` was copied from the slicing path and did not
// apply here.
FailureOr<TilingResult>
getTiledImplementationOnNuma(Operation *op, OpBuilder &b,
                             [[maybe_unused]] ArrayRef<OpFoldResult> offsets,
                             [[maybe_unused]] ArrayRef<OpFoldResult> sizes) {
  LinalgOp linalgOp = cast<LinalgOp>(op);
  // Clone with the original (untiled) operands and matching result types.
  SmallVector<Value> valuesToTile = linalgOp->getOperands();
  SmallVector<Type> resultTensorTypes =
      getTensorOutputTypes(linalgOp, valuesToTile);
  Operation *tiledOp = clone(b, linalgOp, resultTensorTypes, valuesToTile);
  return TilingResult{{tiledOp}, SmallVector<Value>(tiledOp->getResults())};
}

FailureOr<linalg::ForallReductionTilingResult> tileAllUsingForall(
RewriterBase &b, PartialReductionOpInterface op,
ArrayRef<OpFoldResult> threadNums, ArrayRef<OpFoldResult> tileSizes,
Expand Down Expand Up @@ -964,6 +980,17 @@ FailureOr<linalg::ForallReductionTilingResult> tileAllUsingForall(
// 4.b. Clone the op and update init operands.
// We cannot use a IRMapping here because it can replace
// different OpOperands with the same value.
bool isNumaLoop = false;
if (tileSizes.size() == iterationDomain.size()) {
for (auto [idx, tile] : llvm::enumerate(tileSizes)) {
if (idx == 0 && tileSizes[idx] == iterationDomain[idx].size)
break;
if (idx > 0 && tileSizes[idx] != iterationDomain[idx].size)
break;
if (idx == tileSizes.size() - 1)
isNumaLoop = true;
}
}
Operation *clonedOp = b.clone(*op.getOperation());
b.modifyOpInPlace(clonedOp, [&]() {
for (auto [initOperandPtr, tiledInitValue] : llvm::zip_equal(
Expand All @@ -974,17 +1001,32 @@ FailureOr<linalg::ForallReductionTilingResult> tileAllUsingForall(
});
// 5. Tile the cloned op and delete the clone.
if (tileSizes.empty() || threadNums.empty()) {
FailureOr<TilingResult> tilingResult =
cast<TilingInterface>(clonedOp).getTiledImplementation(
b, tiledOffsets, tiledSizes);
if (failed(tilingResult))
return clonedOp->emitError("Failed to tile op: ");
if (tilingResult->tiledOps.size() != 1) {
return clonedOp->emitError("expected a single produced tiled op, got ")
<< tilingResult->tiledOps.size();
if (!isNumaLoop) {
FailureOr<TilingResult> tilingResult =
cast<TilingInterface>(clonedOp).getTiledImplementation(
b, tiledOffsets, tiledSizes);
if (failed(tilingResult))
return clonedOp->emitError("Failed to tile op: ");
if (tilingResult->tiledOps.size() != 1) {
return clonedOp->emitError(
"expected a single produced tiled op, got ")
<< tilingResult->tiledOps.size();
}
tiledOp = tilingResult->tiledOps.front();
tilingResults = tilingResult->tiledValues;
} else {
FailureOr<TilingResult> tilingResult = getTiledImplementationOnNuma(
cast<TilingInterface>(clonedOp), b, tiledOffsets, tiledSizes);
if (failed(tilingResult))
return clonedOp->emitError("Failed to tile op: ");
if (tilingResult->tiledOps.size() != 1) {
return clonedOp->emitError(
"expected a single produced tiled op, got ")
<< tilingResult->tiledOps.size();
}
tiledOp = tilingResult->tiledOps.front();
tilingResults = tilingResult->tiledValues;
}
tiledOp = tilingResult->tiledOps.front();
tilingResults = tilingResult->tiledValues;
} else {
LinalgTilingOptions options;
FailureOr<TiledLinalgOp> maybeTiled = tileLinalgOpImpl<scf::ForOp>(
Expand Down Expand Up @@ -1039,6 +1081,19 @@ FailureOr<linalg::ForallReductionTilingResult> tileAllUsingForall(
nonZeroDimIdx++;
}
}
if (auto attr = resultSizesRank[0].dyn_cast<Attribute>()) {
if (auto intAttr = attr.dyn_cast<IntegerAttr>()) {
if (intAttr.getInt() == 16)
resultSizesRank[0] = b.getIndexAttr(32);
}
} else if (auto value = resultSizesRank[0].dyn_cast<Value>()) {
if (auto constantOp = value.getDefiningOp<arith::ConstantOp>()) {
if (auto intAttr = constantOp.getValue().dyn_cast<IntegerAttr>()) {
if (intAttr.getInt() == 16)
resultSizesRank[0] = b.getIndexAttr(32);
}
}
}
if (hasReductionThreads) {
for (auto [parallelDims, redVar] :
llvm::zip(constantNewParallelDims, reductionInductionVars)) {
Expand Down
18 changes: 18 additions & 0 deletions scripts/generate_single_matmul_mlir.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,22 @@ def generate_single_matmul_mlir(M, N, K):
'''
return mlir_code

def generate_single_matmul_mlir_wo_data(M, N, K):
    """Generate an MLIR f32 matmul entry point without embedded data.

    Unlike generate_single_matmul_mlir, the operands are passed in as
    function arguments (no constant tensors baked into the IR), so no
    reference result is needed here.

    Args:
        M, N, K: matmul dimensions (A is MxK, B is KxN, result is MxN).

    Returns:
        The MLIR module text as a string.
    """
    # The original version computed numpy.dot(mat_A, mat_B) here and discarded
    # the result — O(M*N*K) dead work, since the generated IR embeds no data.
    block_start = "{"
    block_end = "}"
    mlir_code = f'''
func.func @main_entry(%arg0: tensor<{M}x{K}xf32>, %arg1: tensor<{K}x{N}xf32> ) -> tensor<{M}x{N}xf32> attributes {block_start}llvm.emit_c_interface{block_end} {block_start}
%cst_0 = arith.constant 0.000000e+00 : f32
%0 = tensor.empty() : tensor<{M}x{N}xf32>
%1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<{M}x{N}xf32>) -> tensor<{M}x{N}xf32>
%2 = linalg.matmul ins(%arg0, %arg1 : tensor<{M}x{K}xf32>, tensor<{K}x{N}xf32>) outs(%1 : tensor<{M}x{N}xf32>) -> tensor<{M}x{N}xf32>
return %2 : tensor<{M}x{N}xf32>
{block_end}
'''
    return mlir_code

def generate_mlir_bf16_2dx4d(M, N, K, tile_m = 32, tile_n = 32, tile_k = 32, dtype_size=2):
M_block = (M-1) // tile_m + 1
Expand Down Expand Up @@ -123,6 +139,8 @@ def generate_mlir_f32_4dx4d_generic(M, N, K, tile_m = 32, tile_n = 32, tile_k =
args = parser.parse_args()
if args.mode == "correctness":
code = generate_single_matmul_mlir(args.M, args.N, args.K)
elif args.mode == "f32_2dx2d":
code = generate_single_matmul_mlir_wo_data(args.M, args.N, args.K)
elif args.mode == "bf16_2dx4d":
code = generate_mlir_bf16_2dx4d(args.M, args.N, args.K, args.tile_m, args.tile_n, args.tile_k)
elif args.mode == "bf16_4dx4d":
Expand Down
19 changes: 13 additions & 6 deletions scripts/run_all.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
set -ex
export PYTHONPATH=`pwd`/python_packages/tpp_core
export LD_PRELOAD=/home/zhangyan/miniforge3/envs/gc/lib/libiomp5.so
export MLIR_RUNNER_UTILS=/home/zhangyan/graph_compiler_v2/externals/llvm-project/llvm-install/lib/libmlir_runner_utils.so
export MLIR_C_RUNNER_UTILS=/home/zhangyan/graph_compiler_v2/externals/llvm-project/llvm-install/lib/libmlir_runner_utils.so
BUILD_DIR=${PROJECT_DIR}/build

export L1_CACHE_SIZE=49152
export L2_CACHE_SZIE=2097152
export L3_CACHE_SIZE=1966080
PROJECT_DIR=`pwd`/../
BUILD_DIR=${PROJECT_DIR}/build
export L3_CACHE_SIZE=335544320
export PROJECT_DIR=/home/zhangyan/graph_compiler_v2

export PYTHONPATH=${PROJECT_DIR}/build/python_packages/gc_mlir_core
export LD_PRELOAD="/home/zhicong/miniforge3/lib/libiomp5.so ${PROJECT_DIR}/build/lib/libGCCpuRuntime.so"
export LD_PRELOAD=$LD_PRELOAD:"/home/zhangyan/miniforge3/envs/gc/lib/libiomp5.so"
export LD_PRELOAD=$LD_PRELOAD:"/home/zhangyan/graph_compiler_v2/build/lib/libGCCpuRuntime.so"
export MLIR_RUNNER_UTILS=${PROJECT_DIR}/externals/llvm-project/build/lib/libmlir_runner_utils.so
export MLIR_C_RUNNER_UTILS=${PROJECT_DIR}/externals/llvm-project/build/lib/libmlir_c_runner_utils.so

Expand All @@ -16,14 +23,14 @@ cd $BUILD_DIR
echo "thread, dtype, bs, hidden_size, tile, time(ms), GFlops, extra, cmd"
for tile in 32 64 128
do
for thread in 1 32 56
for thread in 32
do
for mode in f32_4dx4d_generic bf16_4dx4d
do

for hidden_size in 4096x4096 4096x11008 11008x4096 4096x32000
do
for bs in 1 16 32 64 512
for bs in 32
do
export OMP_NUM_THREADS=$thread
M_SIZE=$bs
Expand Down