From ad8f814a14aa9766b82d80a68f73155d4878d04c Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Tue, 17 Sep 2024 19:46:55 -0400 Subject: [PATCH] [LLVMGPU] Delete dead code in prefetch pass (#18543) The multi-stage prefetching was not used/tested. Also fix some typos. --- .../Utils/PrefetchSharedMemoryCopy.cpp | 113 ++---------------- 1 file changed, 13 insertions(+), 100 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp index 544c73f5a695..5e1f539f1281 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp @@ -82,7 +82,6 @@ class LoopPrefetcher { prefetcher.mapping = SmallVector(4); prefetcher.forOp = op; prefetcher.lb = prefetcher.ub = prefetcher.step = 0; - prefetcher.singleStage = true; if (failed(prefetcher.initializeLoopInfo())) { LDBG("Failed to initialize loop info (unsupported loop)"); @@ -99,44 +98,20 @@ class LoopPrefetcher { // Emits the prologue before the main pipelined loop and returns the read // results to be passed to the main loop as initial loop carried values, and - // their useages by corresponding writes in the main loop. + // their usages by corresponding writes in the main loop. std::tuple, SmallVector> emitPrologue(RewriterBase &rewriter) { Location loc = forOp.getLoc(); Value zero = rewriter.create(loc, lb); - Value one = rewriter.create(loc, lb + step); SmallVector iterArgs; SmallVector readResults; SmallVector writeArgs; - if (singleStage) { - // If we only prefetch one step ahead, we can directly write in the - // prologue and use the shared memory to communicate data instead of the - // loop carried values. - // Read (0) - emitRead(mapping[0], rewriter, zero); - // Write(0) - emitWrite(mapping[0], rewriter, zero); - return {iterArgs, writeArgs}; - } - - // Read(0). - iterArgs = emitRead(mapping[0], rewriter, zero); - // Read(1). - readResults = emitRead(mapping[1], rewriter, one); - llvm::append_range(iterArgs, readResults); - - // Collect the values to be used as write args. - for (Operation *op : readStage) { - if (auto transferReadOp = dyn_cast(op)) { - for (Operation *user : transferReadOp.getResult().getUsers()) { - if (auto writeOp = dyn_cast(user)) { - writeArgs.push_back(writeOp.getVector()); - } - } - } - } - + // Directly write in the prologue and use the shared memory to communicate + // data instead of the loop carried values. Read (0) + emitRead(mapping[0], rewriter, zero); + // Write(0) + emitWrite(mapping[0], rewriter, zero); return {iterArgs, writeArgs}; } @@ -145,7 +120,7 @@ class LoopPrefetcher { SmallVector &newIterArgs, SmallVector &writeArgs) { Location loc = forOp.getLoc(); - int64_t newUpperBound = singleStage ? (ub - step) : (ub - 2 * step); + int64_t newUpperBound = ub - step; auto newUb = rewriter.create(loc, newUpperBound); // Keep original iter args and then add some for what's being loaded to @@ -154,9 +129,6 @@ class LoopPrefetcher { llvm::append_range(iterArgs, newIterArgs); Value newStep = forOp.getStep(); - if (!singleStage) { - newStep = rewriter.create(loc, newStep, newStep); - } auto newForOp = rewriter.create(loc, forOp.getLowerBound(), newUb, newStep, iterArgs); @@ -165,19 +137,6 @@ class LoopPrefetcher { if (!newForOp.getBody()->empty()) rewriter.eraseOp(newForOp.getBody()->getTerminator()); - if (singleStage) - return newForOp; - - SmallVector targetValues(writeArgs.size()); - for (size_t i = 0, e = writeArgs.size(); i != e; ++i) - targetValues[i] = newForOp.getRegionIterArg(i + 1); - - createWriteMappings(writeArgs, targetValues, mapping[0]); - - for (size_t i = 0, e = writeArgs.size(); i != e; ++i) - targetValues[i] = newForOp.getRegionIterArg(i + e + 1); - - createWriteMappings(writeArgs, targetValues, mapping[1]); return newForOp; } @@ -188,8 +147,6 @@ class LoopPrefetcher { Value indVar = newForOp.getInductionVar(); Value increment = rewriter.create(loc, step); Value iPlusOne = rewriter.create(loc, indVar, increment); - Value iPlusTwo = rewriter.create(loc, iPlusOne, increment); - Value iPlusThree = rewriter.create(loc, iPlusTwo, increment); for (int i = 0; i < 3; ++i) { for (auto [idx, arg] : llvm::enumerate(forOp.getRegionIterArgs())) { @@ -198,29 +155,13 @@ class LoopPrefetcher { } SmallVector readRegisters, moreRegisters; - if (singleStage) { - emitRead(mapping[1], rewriter, iPlusOne); - emitBarrier(loc, rewriter); - emitCompute(mapping[0], rewriter, indVar); - emitBarrier(loc, rewriter); - emitWrite(mapping[1], rewriter, iPlusOne); - updateYield(mapping[0], readRegisters, rewriter); - return; - } - - emitWrite(mapping[0], rewriter, indVar); - readRegisters = emitRead(mapping[2], rewriter, iPlusTwo); + emitRead(mapping[1], rewriter, iPlusOne); emitBarrier(loc, rewriter); - auto computeResults = emitCompute(mapping[0], rewriter, indVar); - mapping[0].map(forOp.getRegionIterArg(0), computeResults[0]); + emitCompute(mapping[0], rewriter, indVar); emitBarrier(loc, rewriter); emitWrite(mapping[1], rewriter, iPlusOne); - moreRegisters = emitRead(mapping[3], rewriter, iPlusThree); - emitBarrier(loc, rewriter); - emitCompute(mapping[0], rewriter, iPlusOne); - emitBarrier(loc, rewriter); - readRegisters.append(moreRegisters.begin(), moreRegisters.end()); updateYield(mapping[0], readRegisters, rewriter); + return; } // Emits the epilogue after the main pipelined loop and returns the final @@ -229,8 +170,6 @@ class LoopPrefetcher { SmallVector &writeArgs) { rewriter.setInsertionPointAfter(newForOp); Location loc = forOp.getLoc(); - Value nMinusTwo = - rewriter.create(loc, ub - 2 * step); Value nMinusOne = rewriter.create(loc, ub - 1 * step); @@ -239,32 +178,8 @@ class LoopPrefetcher { mapping[0].map(forOp.getRegionIterArg(i), newForOp.getResult(i)); } - if (singleStage) { - emitBarrier(loc, rewriter); - return emitCompute(mapping[0], rewriter, nMinusOne); - } - - SmallVector targetValues(writeArgs.size()); - for (size_t i = 0, e = writeArgs.size(); i != e; ++i) - targetValues[i] = newForOp.getResult(i + 1); - - createWriteMappings(writeArgs, targetValues, mapping[2]); - - for (size_t i = 0, e = writeArgs.size(); i != e; ++i) - targetValues[i] = newForOp.getResult(i + e + 1); - - createWriteMappings(writeArgs, targetValues, mapping[3]); - - emitWrite(mapping[2], rewriter, nMinusTwo); - emitBarrier(loc, rewriter); - SmallVector computeResults = - emitCompute(mapping[0], rewriter, nMinusTwo); - mapping[0].map(forOp.getRegionIterArg(0), computeResults[0]); - emitBarrier(loc, rewriter); - emitWrite(mapping[3], rewriter, nMinusOne); emitBarrier(loc, rewriter); - computeResults = emitCompute(mapping[0], rewriter, nMinusOne); - return computeResults; + return emitCompute(mapping[0], rewriter, nMinusOne); } private: @@ -310,7 +225,7 @@ class LoopPrefetcher { } // We only support loops whose bodies can be divided into 3 stages (read, - // write, compute). If there are any remaning ops with side effects (except + // write, compute). If there are any remaining ops with side effects (except // for gpu.barrier), the loop is not supported. LogicalResult initializeStages() { DenseSet readDependencies; @@ -372,7 +287,7 @@ class LoopPrefetcher { return success(); } - /// Clones |op| and call |callback| on the cloned op's oeprands as well as any + /// Clones |op| and call |callback| on the cloned op's operands as well as any /// operands of nested ops that 1) aren't defined within the new op or 2) are /// block arguments. static Operation * @@ -511,8 +426,6 @@ class LoopPrefetcher { scf::ForOp forOp; // Original static loop range and step. int64_t lb, ub, step; - // Whether we only prefetch one single step ahead. - bool singleStage; // Ops in the original scf.for loop that belongs to different classes. SmallVector readStage;