
[AMDGPU] Change control flow intrinsic lowering making the wave to re… #86805

Closed · wants to merge 6 commits
4 changes: 2 additions & 2 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3172,8 +3172,8 @@ def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
[llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
[IntrWillReturn, IntrNoCallback, IntrNoFree]>;
def int_amdgcn_wave_reconverge : Intrinsic<[], [llvm_anyint_ty],
[IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Represent unreachable in a divergent region.
def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>;
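
For context, a minimal IR sketch of how the renamed intrinsic is intended to be used after annotation (assuming wave64, i.e. an i64 exec mask; the function and block names are illustrative, not taken from this patch). The notable difference from `llvm.amdgcn.end.cf` is placement: the call now ends the divergent path itself, right before the branch to the join point, instead of opening the join block:

```llvm
; Hedged sketch of an annotated divergent if/endif, wave64.
declare { i1, i64 } @llvm.amdgcn.if.i64(i1)
declare void @llvm.amdgcn.wave.reconverge.i64(i64)

define amdgpu_ps float @sketch(float %v, i32 %c) {
entry:
  %cc = icmp eq i32 %c, 0
  %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cc)
  %guard = extractvalue { i1, i64 } %if, 0
  %mask = extractvalue { i1, i64 } %if, 1
  br i1 %guard, label %then, label %merge

then:
  %inc = fadd float %v, 1.0
  ; Restore the saved exec mask at the end of the divergent path,
  ; just before rejoining the rest of the wave.
  call void @llvm.amdgcn.wave.reconverge.i64(i64 %mask)
  br label %merge

merge:
  %r = phi float [ %inc, %then ], [ %v, %entry ]
  ret float %r
}
```
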
9 changes: 5 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1553,11 +1553,12 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
return true;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
bool AMDGPUInstructionSelector::selectWaveReconvergeIntrinsic(
MachineInstr &MI) const {
// FIXME: Manually selecting to avoid dealing with the SReg_1 trick
// SelectionDAG uses for wave32 vs wave64.
MachineBasicBlock *BB = MI.getParent();
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_WAVE_RECONVERGE))
.add(MI.getOperand(1));

Register Reg = MI.getOperand(1).getReg();
@@ -2083,8 +2084,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
MachineInstr &I) const {
unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
switch (IntrinsicID) {
case Intrinsic::amdgcn_end_cf:
return selectEndCfIntrinsic(I);
case Intrinsic::amdgcn_wave_reconverge:
return selectWaveReconvergeIntrinsic(I);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
return selectDSOrderedIntrinsic(I, IntrinsicID);
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -119,7 +119,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectReturnAddress(MachineInstr &I) const;
bool selectG_INTRINSIC(MachineInstr &I) const;

bool selectEndCfIntrinsic(MachineInstr &MI) const;
bool selectWaveReconvergeIntrinsic(MachineInstr &MI) const;
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4954,7 +4954,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
case Intrinsic::amdgcn_end_cf: {
case Intrinsic::amdgcn_wave_reconverge: {
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
92 changes: 51 additions & 41 deletions llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -15,6 +15,7 @@
#include "GCNSubtarget.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -53,7 +54,7 @@ class SIAnnotateControlFlow : public FunctionPass {
Function *Else;
Function *IfBreak;
Function *Loop;
Function *EndCf;
Function *WaveReconverge;

DominatorTree *DT;
StackVector Stack;
@@ -86,7 +87,7 @@ class SIAnnotateControlFlow : public FunctionPass {

bool handleLoop(BranchInst *Term);

bool closeControlFlow(BasicBlock *BB);
bool tryWaveReconverge(BasicBlock *BB);

public:
static char ID;
@@ -141,7 +142,7 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) {
IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break,
{ IntMask });
Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask });
EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask });
WaveReconverge = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_wave_reconverge, { IntMask });
}

/// Is the branch condition uniform or did the StructurizeCFG pass
@@ -203,8 +204,6 @@ bool SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {

/// Open a new "If" block
bool SIAnnotateControlFlow::openIf(BranchInst *Term) {
if (isUniform(Term))
return false;

IRBuilder<> IRB(Term);
Value *IfCall = IRB.CreateCall(If, {Term->getCondition()});
@@ -305,43 +304,43 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
}

/// Close the last opened control flow
bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
llvm::Loop *L = LI->getLoopFor(BB);
bool SIAnnotateControlFlow::tryWaveReconverge(BasicBlock *BB) {

assert(Stack.back().first == BB);
if (succ_empty(BB))
return false;

if (L && L->getHeader() == BB) {
// We can't insert an EndCF call into a loop header, because it will
// get executed on every iteration of the loop, when it should be
// executed only once before the loop.
SmallVector <BasicBlock *, 8> Latches;
L->getLoopLatches(Latches);
BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
if (Term->getNumSuccessors() == 1) {
// The current BB's single successor is the top of the stack. We need to
// reconverge over that path.
BasicBlock *SingleSucc = *succ_begin(BB);
BasicBlock::iterator InsPt = Term ? BasicBlock::iterator(Term) : BB->end();

SmallVector<BasicBlock *, 2> Preds;
for (BasicBlock *Pred : predecessors(BB)) {
if (!is_contained(Latches, Pred))
Preds.push_back(Pred);
if (isTopOfStack(SingleSucc)) {
Value *Exec = Stack.back().second;
IRBuilder<>(BB, InsPt).CreateCall(WaveReconverge, {Exec});
}

BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr,
false);
}

Value *Exec = popSaved();
BasicBlock::iterator FirstInsertionPt = BB->getFirstInsertionPt();
if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) {
Instruction *ExecDef = cast<Instruction>(Exec);
BasicBlock *DefBB = ExecDef->getParent();
if (!DT->dominates(DefBB, BB)) {
// Split edge to make Def dominate Use
FirstInsertionPt = SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
} else {
// We have a uniform conditional branch terminating the block.
// This block may be the last in the Then path of the enclosing divergent
// IF.
if (!isUniform(Term))
// Divergent loop is going to be further processed in another place
return false;

for (auto Succ : Term->successors()) {
if (isTopOfStack(Succ)) {
// Just split to make room for the later WAVE_RECONVERGE insertion
SmallVector<BasicBlock*, 2> Preds;
for (auto P : predecessors(Succ)) {
if (DT->dominates(BB, P))
Preds.push_back(P);
}
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
SplitBlockPredecessors(Succ, Preds, ".reconverge", &DTU, LI,
nullptr, false);
}
}
IRBuilder<> IRB(FirstInsertionPt->getParent(), FirstInsertionPt);
// TODO: StructurizeCFG 'Flow' blocks have debug locations from the
// condition, for now just avoid copying these DebugLocs so that stepping
// out of the then/else block in a debugger doesn't step to the condition.
IRB.SetCurrentDebugLocation(DebugLoc());
IRB.CreateCall(EndCf, {Exec});
}

return true;
@@ -365,14 +364,20 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {

if (!Term || Term->isUnconditional()) {
if (isTopOfStack(BB))
Changed |= closeControlFlow(BB);
Stack.pop_back();

Changed |= tryWaveReconverge(BB);

continue;
}

if (I.nodeVisited(Term->getSuccessor(1))) {
if (isTopOfStack(BB))
Changed |= closeControlFlow(BB);
Stack.pop_back();

// Let's take care of a uniform loop latch that may be closing the Then
// path of the enclosing divergent branch.
Changed |= tryWaveReconverge(BB);

if (DT->dominates(Term->getSuccessor(1), BB))
Changed |= handleLoop(Term);
Expand All @@ -387,10 +392,15 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
continue;
}

Changed |= closeControlFlow(BB);
Stack.pop_back();
}

Changed |= openIf(Term);
if (isUniform(Term))
// A uniform conditional branch may be in the block that closes the Then
// path of the divergent conditional branch.
Changed |= tryWaveReconverge(BB);
else
Changed |= openIf(Term);
}

if (!Stack.empty()) {
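
To make the uniform-branch path in `tryWaveReconverge` concrete, here is a hypothetical sketch (again wave64; all names are illustrative, not from this patch's tests) of a uniform loop latch closing the Then path of a divergent if. The pass splits the latch's exit edge off the join block, and the resulting `.reconverge` block receives the call:

```llvm
; Hedged sketch of the post-annotation CFG; %n is assumed uniform (inreg).
declare { i1, i64 } @llvm.amdgcn.if.i64(i1)
declare void @llvm.amdgcn.wave.reconverge.i64(i64)

define amdgpu_ps i32 @latch_sketch(i32 %div, i32 inreg %n) {
entry:
  %cc = icmp eq i32 %div, 0
  %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cc)
  %guard = extractvalue { i1, i64 } %if, 0
  %mask = extractvalue { i1, i64 } %if, 1
  br i1 %guard, label %header, label %merge

header:                            ; uniform loop inside the Then path
  %i = phi i32 [ 0, %entry ], [ %i.next, %header ]
  %i.next = add i32 %i, 1
  %done = icmp eq i32 %i.next, %n  ; uniform latch condition
  br i1 %done, label %merge.reconverge, label %header

merge.reconverge:                  ; block carved out by SplitBlockPredecessors
  call void @llvm.amdgcn.wave.reconverge.i64(i64 %mask)
  br label %merge

merge:                             ; join point of the enclosing divergent if
  %r = phi i32 [ %i.next, %merge.reconverge ], [ 0, %entry ]
  ret i32 %r
}
```
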
34 changes: 30 additions & 4 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6299,7 +6299,7 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
return AMDGPUISD::ELSE;
case Intrinsic::amdgcn_loop:
return AMDGPUISD::LOOP;
case Intrinsic::amdgcn_end_cf:
case Intrinsic::amdgcn_wave_reconverge:
llvm_unreachable("should not occur");
default:
return 0;
@@ -9940,8 +9940,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,

return SDValue(Load, 0);
}
case Intrinsic::amdgcn_end_cf:
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
case Intrinsic::amdgcn_wave_reconverge:
return SDValue(DAG.getMachineNode(AMDGPU::SI_WAVE_RECONVERGE, DL, MVT::Other,
Op->getOperand(2), Chain), 0);
case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_join:
@@ -15740,6 +15740,32 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
}
}

// ISel inserts copies to registers for the successor PHIs at the end of
// the block. We need to move the SI_WAVE_RECONVERGE right before the
// branch.
for (auto &MBB : MF) {
for (auto &MI : MBB) {
if (MI.getOpcode() == AMDGPU::SI_WAVE_RECONVERGE) {
MachineBasicBlock::iterator I(MI);
MachineBasicBlock::iterator Next = std::next(I);
bool NeedToMove = false;
while (Next != MBB.end() && !Next->isBranch()) {
NeedToMove = true;
Next++;
}

assert((Next == MBB.end() || !Next->readsRegister(AMDGPU::SCC, TRI)) &&
"Malformed CFG detected!\n");

if (NeedToMove) {
MBB.splice(Next, &MBB, &MI);
}

break;
}
}
}

// FIXME: This is a hack to fixup AGPR classes to use the properly aligned
// classes if required. Ideally the register class constraints would differ
// per-subtarget, but there's no easy way to achieve that right now. This is
@@ -16336,7 +16362,7 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
default:
Result = false;
break;
case Intrinsic::amdgcn_end_cf:
case Intrinsic::amdgcn_wave_reconverge:
case Intrinsic::amdgcn_loop:
Result = true;
break;
31 changes: 30 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2103,12 +2103,36 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(AMDGPU::S_MOV_B64));
break;

case AMDGPU::S_CMOV_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_CMOV_B64));
break;

case AMDGPU::S_MOV_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_MOV_B32));
break;

case AMDGPU::S_CMOV_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_CMOV_B32));
break;

case AMDGPU::S_CSELECT_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_CSELECT_B32));
break;

case AMDGPU::S_CSELECT_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_CSELECT_B64));
break;

case AMDGPU::S_XOR_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
@@ -3088,20 +3112,25 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
while (I != E && !I->isBranch() && !I->isReturn()) {
switch (I->getOpcode()) {
case AMDGPU::S_MOV_B64_term:
case AMDGPU::S_CMOV_B64_term:
case AMDGPU::S_XOR_B64_term:
case AMDGPU::S_OR_B64_term:
case AMDGPU::S_ANDN2_B64_term:
case AMDGPU::S_AND_B64_term:
case AMDGPU::S_AND_SAVEEXEC_B64_term:
case AMDGPU::S_CSELECT_B64_term:
case AMDGPU::S_MOV_B32_term:
case AMDGPU::S_CMOV_B32_term:
case AMDGPU::S_XOR_B32_term:
case AMDGPU::S_OR_B32_term:
case AMDGPU::S_ANDN2_B32_term:
case AMDGPU::S_AND_B32_term:
case AMDGPU::S_AND_SAVEEXEC_B32_term:
case AMDGPU::S_CSELECT_B32_term:
break;
case AMDGPU::SI_IF:
case AMDGPU::SI_ELSE:
case AMDGPU::SI_WAVE_RECONVERGE:
case AMDGPU::SI_KILL_I1_TERMINATOR:
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
// FIXME: It's messy that these need to be considered here at all.
@@ -8782,7 +8811,7 @@ void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
.add(Branch->getOperand(0))
.add(Branch->getOperand(1));
MachineInstr *SIEND =
BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_WAVE_RECONVERGE))
.addReg(DstReg);

IfEntry->erase(TI);
10 changes: 7 additions & 3 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -350,6 +350,8 @@ class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<

let WaveSizePredicate = isWave64 in {
def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
def S_CMOV_B64_term : WrapTerminatorInst<S_CMOV_B64>;
def S_CSELECT_B64_term : WrapTerminatorInst<S_CSELECT_B64>;
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
@@ -359,6 +361,8 @@ def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst<S_AND_SAVEEXEC_B64>;

let WaveSizePredicate = isWave32 in {
def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
def S_CMOV_B32_term : WrapTerminatorInst<S_CMOV_B32>;
def S_CSELECT_B32_term : WrapTerminatorInst<S_CSELECT_B32>;
def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
@@ -475,9 +479,7 @@ def SI_LOOP : CFPseudoInstSI <
let IsNeverUniform = 1;
}

} // End isTerminator = 1

def SI_END_CF : CFPseudoInstSI <
def SI_WAVE_RECONVERGE : CFPseudoInstSI <
(outs), (ins SReg_1:$saved), [], 1, 1> {
let Size = 4;
let isAsCheapAsAMove = 1;
@@ -488,6 +490,8 @@ def SI_END_CF : CFPseudoInstSI <
let mayStore = 1;
}

} // End isTerminator = 1

def SI_IF_BREAK : CFPseudoInstSI <
(outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
let Size = 4;