Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Post RA software pipeliner #146

Open
wants to merge 8 commits into
base: aie-public
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 12 additions & 9 deletions llvm/include/llvm/CodeGen/ResourceScoreboard.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,27 +63,30 @@ template <typename RC> class ResourceScoreboard {
}
RC &operator[](int Cycle) { return Cycles[(Head + Cycle) & (Size - 1)]; }

void reset(int D = 1) {
/// Wipe the scoreboard without changing its dimensions: the head is rewound
/// to slot 0 and all Size entries are replaced by default-constructed ones.
/// The scoreboard must have a non-zero Size at this point (asserted).
void clear() {
  assert(Size);
  Head = 0;
  // Dropping everything first guarantees that resize() re-creates every
  // entry instead of keeping stale ones.
  Cycles.clear();
  Cycles.resize(Size);
}

void reset(int D) {
// Implementation relies on masking to wrap-around, so round up
// to a power of two.
int Pow2 = 1;
while (Pow2 < D) {
Pow2 += Pow2;
}
if (Cycles.empty()) {
Depth = Pow2;
Size = 2 * Depth;
}
Cycles.clear();
Cycles.resize(Size);
Head = 0;
Depth = Pow2;
Size = 2 * Depth;
clear();
}
/// \returns true iff \p DeltaCycles lies in the closed range [-Depth, 0].
bool isValidDelta(int DeltaCycles) const {
  if (DeltaCycles > 0) {
    return false;
  }
  return DeltaCycles >= -Depth;
}

void advance() {
(*this)[0].clearResources();
(*this)[-Depth].clearResources();
Head = (Head + 1) & (Size - 1);
}
void recede() {
Expand Down
8 changes: 5 additions & 3 deletions llvm/lib/Target/AIE/AIE2InstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1185,13 +1185,15 @@ bool AIE2InstrInfo::isHardwareLoopEnd(unsigned Opcode) const {
return Opcode == AIE2::PseudoLoopEnd;
}

bool AIE2InstrInfo::isZOLTripCountDef(const MachineInstr &MI) const {
bool AIE2InstrInfo::isZOLTripCountDef(const MachineInstr &MI,
bool Pristine) const {
return MI.getOpcode() == AIE2::ADD_NC &&
MI.getOperand(0).getReg() == AIE2::LC;
MI.getOperand(0).getReg() == AIE2::LC &&
(!Pristine || MI.getOperand(2).getImm() == 0);
}

void AIE2InstrInfo::adjustTripCount(MachineInstr &MI, int Adjustment) const {
assert(MI.getOpcode() == AIE2::ADD_NC);
assert(isZOLTripCountDef(MI));
auto &Imm = MI.getOperand(2);
martien-de-jong marked this conversation as resolved.
Show resolved Hide resolved
Imm.setImm(Imm.getImm() + Adjustment);
}
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AIE/AIE2InstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ class AIE2InstrInfo : public AIE2GenInstrInfo {
virtual bool isHardwareLoopStart(unsigned Opcode) const override;
virtual bool isHardwareLoopEnd(unsigned Opcode) const override;

bool isZOLTripCountDef(const MachineInstr &MI) const override;
bool isZOLTripCountDef(const MachineInstr &MI,
bool Pristine = false) const override;
void adjustTripCount(MachineInstr &MI, int Adjustment) const override;

virtual bool
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AIE/AIE2InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,7 @@ def LoopJNZ :
// Zero overhead loops
let hasSideEffects = true, mayLoad = false, mayStore = false,
isNotDuplicable = true in {
def LoopStart : Pseudo<(outs), (ins eR:$src), "loop_start", "${src}">;
def LoopStart : Pseudo<(outs), (ins eR:$src, simm6:$adj), "loop_start", "${src}, ${adj}">;

// PseudoLoopEnd represents the terminator of a ZOL. You can view it as a
// conditional branch to the loop body. We make this a meta instruction,
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AIE/AIE2InstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -654,7 +654,8 @@ bool AIE2InstructionSelector::select(MachineInstr &I) {
case Intrinsic::start_loop_iterations:
return selectStartLoop(I, MRI);
case Intrinsic::set_loop_iterations: {
auto LS = MIB.buildInstr(AIE2::LoopStart, {}, {I.getOperand(1)});
auto LS =
MIB.buildInstr(AIE2::LoopStart, {}, {I.getOperand(1)}).addImm(0);
I.eraseFromParent();
return constrainSelectedInstRegOperands(*LS, TII, TRI, RBI);
}
Expand Down
62 changes: 59 additions & 3 deletions llvm/lib/Target/AIE/AIEBaseHardwareLoops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,57 @@ char AIEBaseHardwareLoops::ID = 0;
INITIALIZE_PASS(AIEBaseHardwareLoops, DEBUG_TYPE, AIE_HARDWARE_LOOPS_NAME,
false, false)

namespace {
// Everything becomes much simpler if ZOLs exit to their layout successor.
// We can achieve that by splitting a non-fallthrough exit jump off into
// a new fallthrough block.
gbossu marked this conversation as resolved.
Show resolved Hide resolved
// Note that we don't actually check the block layout; we are near the end of
// the codegen pipeline, and we assume that explicit jumps to a fallthrough
// block don't occur.
/// If \p MBB is terminated by a hardware loop end whose exit is an explicit
/// jump, move that jump into a fresh block placed directly after \p MBB, so
/// that the loop exits to its layout successor.
///
/// \param MBB The block to inspect and, if needed, split.
/// \param TII Target hooks used to recognize the loop end and to analyze,
///            remove and insert branches.
/// \returns true if the CFG was modified, false if \p MBB was left untouched
///          (no loop end terminator, unanalyzable branch, or no explicit
///          exit jump).
bool splitLoopEndJump(MachineBasicBlock &MBB, const AIEBaseInstrInfo *TII) {
  auto Terminator = MBB.getFirstInstrTerminator();
  if (Terminator == MBB.end() ||
      !TII->isHardwareLoopEnd(Terminator->getOpcode())) {
    return false;
  }
  SmallVector<MachineOperand, 4> Cond;
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  const bool AllowModify = false;
  if (TII->analyzeBranch(MBB, TBB, FBB, Cond, AllowModify)) {
    // can't analyze
    return false;
  }
  if (!FBB) {
    // Is fallthrough already
    return false;
  }

  // So:
  // 1. Create a new block in fallthrough position
  // 2. remove the entire control flow
  // 3. add back the loopend to this block
  // 4. add back the jump to the new block
  // 5. fix successors
  auto *MF = MBB.getParent();
  MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(FBB->getBasicBlock());
  MF->insert(++MBB.getIterator(), NewBB);
  TII->removeBranch(MBB);
  DebugLoc DL;
  TII->insertBranch(MBB, TBB, nullptr, Cond, DL);
  Cond.clear();
  TII->insertBranch(*NewBB, FBB, nullptr, Cond, DL);

  // Redirect the exit edge MBB -> FBB to MBB -> NewBB -> FBB.
  // replaceSuccessor rewrites the edge in place. This avoids erasing from the
  // successor list while iterating over it, and it carries the probability of
  // the old edge over to the new one.
  NewBB->addSuccessor(FBB);
  MBB.replaceSuccessor(FBB, NewBB);
  return true;
}
} // namespace

bool AIEBaseHardwareLoops::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
LLVM_DEBUG(dbgs() << "AIE Hardware Loops on " << MF->getName()
Expand All @@ -186,6 +237,10 @@ bool AIEBaseHardwareLoops::runOnMachineFunction(MachineFunction &mf) {
TRI = mf.getSubtarget().getRegisterInfo();

bool Changed = false;
for (auto &MBB : *MF) {
Changed |= splitLoopEndJump(MBB, TII);
}

for (auto *ML : *MLI) {
if (ML->isOutermost())
Changed |= processLoop(ML);
Expand Down Expand Up @@ -298,11 +353,12 @@ void AIEBaseHardwareLoops::expandLoopStart(LowOverheadLoop &LoLoop) {
MachineBasicBlock *MBB = Start->getParent();
LLVM_DEBUG(dbgs() << "AIE Loops: ZOL loop. Expanding LoopStart.\n");

// We use ADD_NC, which allows PostPipeliner to tweak it by modifying the
// immediate value.
// LoopStart carries an immediate operand that is dedicated to the tripcount
// update of the pipeliner. We translate to ADD_NC, which has a similar
// operand.
BuildMI(*MBB, Start, Start->getDebugLoc(), TII->get(AIE2::ADD_NC), AIE2::LC)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We merged e33bbe1 as a bug fix for the trip count adjustment. As this change also overcomes the same problem, we can replace it later (revert the first fix, for example).

.addReg(Start->getOperand(0).getReg())
.addImm(0);
.addImm(Start->getOperand(1).getImm());

BuildMI(*MBB, Start, Start->getDebugLoc(), TII->get(AIE2::MOVXM_lng_cg),
AIE2::LS)
Expand Down
9 changes: 7 additions & 2 deletions llvm/lib/Target/AIE/AIEBaseInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,13 @@ struct AIEBaseInstrInfo : public TargetInstrInfo {
virtual bool isHardwareLoopStart(unsigned Opcode) const { return false; }
virtual bool isHardwareLoopEnd(unsigned Opcode) const { return false; }

/// Check whether this defines the ZOL tripcount
virtual bool isZOLTripCountDef(const MachineInstr &MI) const { return false; }
/// Check whether \p MI defines the ZOL tripcount. If this returns true, \p MI
/// should be suitable for calling adjustTripCount on it.
/// If \p Pristine is set, we additionally check that the tripcount definition
/// wasn't updated before.
/// This default implementation recognizes no instruction and always returns
/// false.
virtual bool isZOLTripCountDef(const MachineInstr &MI,
bool Pristine = false) const {
return false;
}

/// Lower the tripcount defined by MI with Update, which is a small
/// negative integer that should be added to the tripcount
Expand Down
84 changes: 53 additions & 31 deletions llvm/lib/Target/AIE/AIEBasePipelinerLoopInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,10 @@ cl::opt<bool> LoopWholeLoopGuard(
cl::desc("Allow SWP schedules requiring a guard around the whole loop"),
cl::init(true), cl::Hidden);

// If we hoist we can get a better performance (no clear evidence
// of the reason). If we don't hoist, we can change the LoopsStart expansion
// to reuse this non-hoisted add.
cl::opt<bool> HoistZOLAdjust(
"aie-pipeliner-hoist-zol-adjustment",
cl::desc("Host the trip count adjustment for ZOL (when possible)"),
cl::init(false), cl::Hidden);
cl::opt<int> PostPipelinerCandidateLimit(
"aie-postpipeliner-limit",
cl::desc("II below which postpipeliner preference kicks in"), cl::init(2),
cl::Hidden);

AIEBasePipelinerLoopInfo::AIEBasePipelinerLoopInfo(MachineInstr *EndLoop,
const AIEBaseInstrInfo &TII)
Expand Down Expand Up @@ -661,6 +658,9 @@ class ZeroOverheadLoop : public AIEBasePipelinerLoopInfo {
MachineInstr *DefTripCount;
MachineBasicBlock *LoopStartBlock;

// Decide whether the postpipeliner may do a better job
bool preferPostPipeliner(SMSchedule &SMS);

public:
ZeroOverheadLoop(MachineInstr *EndLoop, const AIEBaseInstrInfo &TII)
: AIEBasePipelinerLoopInfo(EndLoop, TII) {}
Expand All @@ -672,6 +672,8 @@ class ZeroOverheadLoop : public AIEBasePipelinerLoopInfo {
SmallVectorImpl<MachineOperand> &Cond) override;

bool canAcceptII(SMSchedule &SMS) override;

bool shouldUseSchedule(SwingSchedulerDAG &SSD, SMSchedule &SMS) override;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably update canAcceptII as well. Typically, this will increase the II until we have a low enough stage count. So I think that if we find a schedule with a low II and high stage count, we should immediately refuse it so the post-pipeliner can pick it; and not increase the II.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's an interface change, I think? It should be able to say yes, higher II, and stop. That would definitely save some time.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess we would accept the II if we are confident we can pick up the loop in the post-pipeliner. And for those loops, shouldUseSchedule would return false.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Somehow that makes sense, except for the name of canAcceptII. I will add a fat comment.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean, something can accept it, just not the pre-pipeliner 😄

};

ZeroOverheadLoop::Assessment ZeroOverheadLoop::accept(MachineInstr *EndLoop) {
Expand Down Expand Up @@ -753,32 +755,34 @@ std::optional<bool> ZeroOverheadLoop::createTripCountGreaterCondition(

void ZeroOverheadLoop::adjustTripCount(int TripCountAdjust) {
LLVM_DEBUG(dbgs() << "TripCountAdjust = " << TripCountAdjust << "\n");
if (DefTripCount->getOperand(1).isImm() &&
MRI.hasOneUse(DefTripCount->getOperand(0).getReg())) {
// If we have a constant here, just update the value.
const int64_t InitVal = DefTripCount->getOperand(1).getImm();
DefTripCount->getOperand(1).setImm(InitVal + TripCountAdjust);
} else {
// Otherwise, add the value.
Register Reg = DefTripCount->getOperand(0).getReg();
Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
MachineBasicBlock::iterator InsertPoint = Init->getIterator();
MachineInstr *AdjacentInstr = Init;

if (HoistZOLAdjust) {
// Insert the adjustment just after the instruction that defines it.
// Probably it will be hoisted.
AdjacentInstr = DefTripCount;
InsertPoint = DefTripCount->getIterator();
InsertPoint++;
}

BuildMI(*AdjacentInstr->getParent(), InsertPoint,
AdjacentInstr->getDebugLoc(), TII.get(AIE2::ADD_NC_GPR), NewReg)
.addReg(Reg)
.addImm(TripCountAdjust);
Init->getOperand(0).setReg(NewReg);
// LoopStart has a small immediate addend that can accommodate the adjustment
Init->getOperand(1).setImm(TripCountAdjust);
}

/// Heuristic that decides whether the schedule in \p SMS should be left to
/// the PostPipeliner.
///
/// Zero overhead loops are PostPipeliner candidates; the PostPipeliner copes
/// better with live ranges that span several stages, since it needs no
/// spilling or moving. A live range spans several stages when a dependence
/// latency is at least as long as the II, so that is what we look for.
/// The rejection only applies to IIs below PostPipelinerCandidateLimit.
///
/// CHECK: We assume that the resulting II can be smaller than max(latency).
/// When not, we may need ResMII for this check.
///
/// \returns true if the II is below the limit and some instruction scheduled
///          in cycles [0, II) has a successor edge whose signed latency is
///          greater than or equal to the II.
bool ZeroOverheadLoop::preferPostPipeliner(SMSchedule &SMS) {
  const int II = SMS.getInitiationInterval();
  if (II >= PostPipelinerCandidateLimit) {
    // Not a low-II schedule; the heuristic does not apply.
    return false;
  }

  for (int Cycle = 0; Cycle < II; ++Cycle) {
    for (auto *Node : SMS.getInstructions(Cycle)) {
      for (auto &Edge : Node->Succs) {
        if (Edge.getSignedLatency() < II) {
          continue;
        }
        // Found a dependence that doesn't fit in a single stage.
        LLVM_DEBUG(dbgs() << "PLI: Leaving low-II for PostPipeliner\n");
        return true;
      }
    }
  }
  return false;
}

bool ZeroOverheadLoop::canAcceptII(SMSchedule &SMS) {
Expand All @@ -790,9 +794,27 @@ bool ZeroOverheadLoop::canAcceptII(SMSchedule &SMS) {
return false;
}

// If we think the postpipeliner can do better, accept it here to prevent
// doing more work than necessary. The final verdict in shouldUseSchedule
// will reject it on the same grounds.
if (preferPostPipeliner(SMS)) {
return true;
}

return AIEBasePipelinerLoopInfo::canAcceptII(SMS);
}

/// Final verdict on the schedule in \p SMS. A schedule is used only when the
/// generic AIEBasePipelinerLoopInfo check approves it and preferPostPipeliner
/// does not claim it for the PostPipeliner.
bool ZeroOverheadLoop::shouldUseSchedule(SwingSchedulerDAG &SSD,
                                         SMSchedule &SMS) {
  // A refusal by the base class is conservatively kept; the short-circuit
  // makes sure the PostPipeliner heuristic only runs on approved schedules.
  const bool BaseApproves =
      AIEBasePipelinerLoopInfo::shouldUseSchedule(SSD, SMS);
  return BaseApproves && !preferPostPipeliner(SMS);
}

} // namespace

bool AIEBasePipelinerLoopInfo::canAcceptII(SMSchedule &SMS) {
Expand Down
21 changes: 19 additions & 2 deletions llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "AIEBaseSubtarget.h"
#include "AIE2Subtarget.h"
#include "AIEBaseRegisterInfo.h"
#include "AIEInterBlockScheduling.h"
#include "AIEMachineScheduler.h"
#include "AIEMaxLatencyFinder.h"
#include "AIESubtarget.h"
Expand Down Expand Up @@ -262,8 +263,24 @@ class RegionEndEdges : public ScheduleDAGMutation {
// there must be a distance of 112 bytes in terms of PM addresses.
// 112 bytes correspond to 7 fully-expanded 128-bit instructions and
// hence adding a latency of 8 from LoopStart to the ExitSU.
if (TII->isZeroOverheadLoopSetupInstr(MI))
EdgeLatency = 8;
// We can subtract the number of bundles that interblock scheduling pushed
// into BottomInsert.
// FIXME: This holds as long as we insert them unconditionally. If we
// integrate them with the bottom region, we just need to keep 8 away
// from ExitSU.
if (TII->isZeroOverheadLoopSetupInstr(MI)) {
unsigned PatchCycles = 8;
if (DAG->getBB()) {
auto *Scheduler =
static_cast<AIEScheduleDAGMI *>(DAG)->getSchedImpl();
auto &InterBlock = Scheduler->getInterBlock();
unsigned InsertedCycles =
InterBlock.getBlockState(DAG->getBB()).BottomInsert.size();
PatchCycles =
PatchCycles >= InsertedCycles ? PatchCycles - InsertedCycles : 0;
}
EdgeLatency = std::max(EdgeLatency, PatchCycles);
}

ExitDep.setLatency(EdgeLatency);
DAG->ExitSU.addPred(ExitDep, /*Required=*/true);
Expand Down
Loading
Loading