
[AMDGPU] Change control flow intrinsic lowering making the wave to re… #86805

Closed · wants to merge 6 commits
4 changes: 2 additions & 2 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3172,8 +3172,8 @@ def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
[llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
[IntrWillReturn, IntrNoCallback, IntrNoFree]>;
def int_amdgcn_wave_reconverge : Intrinsic<[], [llvm_anyint_ty],
[IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Represent unreachable in a divergent region.
def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>;
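
For context, a minimal IR sketch of how the renamed intrinsic is intended to be used after annotation (assuming wave64, i.e. an i64 exec mask; the function and block names are illustrative, not taken from this patch). The notable difference from `llvm.amdgcn.end.cf` is placement: the call now ends the divergent path itself, right before the branch to the join point, instead of opening the join block:

```llvm
; Hedged sketch of an annotated divergent if/endif, wave64.
declare { i1, i64 } @llvm.amdgcn.if.i64(i1)
declare void @llvm.amdgcn.wave.reconverge.i64(i64)

define amdgpu_ps float @sketch(float %v, i32 %c) {
entry:
  %cc = icmp eq i32 %c, 0
  %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cc)
  %guard = extractvalue { i1, i64 } %if, 0
  %mask = extractvalue { i1, i64 } %if, 1
  br i1 %guard, label %then, label %merge

then:
  %inc = fadd float %v, 1.0
  ; Restore the saved exec mask at the end of the divergent path,
  ; just before rejoining the rest of the wave.
  call void @llvm.amdgcn.wave.reconverge.i64(i64 %mask)
  br label %merge

merge:
  %r = phi float [ %inc, %then ], [ %v, %entry ]
  ret float %r
}
```
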
9 changes: 5 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1553,11 +1553,12 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
return true;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
bool AMDGPUInstructionSelector::selectWaveReconvergeIntrinsic(
MachineInstr &MI) const {
// FIXME: Manually selecting to avoid dealing with the SReg_1 trick
// SelectionDAG uses for wave32 vs wave64.
MachineBasicBlock *BB = MI.getParent();
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_WAVE_RECONVERGE))
.add(MI.getOperand(1));

Register Reg = MI.getOperand(1).getReg();
@@ -2083,8 +2084,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
MachineInstr &I) const {
unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
switch (IntrinsicID) {
case Intrinsic::amdgcn_end_cf:
return selectEndCfIntrinsic(I);
case Intrinsic::amdgcn_wave_reconverge:
return selectWaveReconvergeIntrinsic(I);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
return selectDSOrderedIntrinsic(I, IntrinsicID);
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -119,7 +119,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectReturnAddress(MachineInstr &I) const;
bool selectG_INTRINSIC(MachineInstr &I) const;

bool selectEndCfIntrinsic(MachineInstr &MI) const;
bool selectWaveReconvergeIntrinsic(MachineInstr &MI) const;
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4954,7 +4954,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
case Intrinsic::amdgcn_end_cf: {
case Intrinsic::amdgcn_wave_reconverge: {
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
92 changes: 51 additions & 41 deletions llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -15,6 +15,7 @@
#include "GCNSubtarget.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -53,7 +54,7 @@ class SIAnnotateControlFlow : public FunctionPass {
Function *Else;
Function *IfBreak;
Function *Loop;
Function *EndCf;
Function *WaveReconverge;

DominatorTree *DT;
StackVector Stack;
@@ -86,7 +87,7 @@ class SIAnnotateControlFlow : public FunctionPass {

bool handleLoop(BranchInst *Term);

bool closeControlFlow(BasicBlock *BB);
bool tryWaveReconverge(BasicBlock *BB);

public:
static char ID;
@@ -141,7 +142,7 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) {
IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break,
{ IntMask });
Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask });
EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask });
WaveReconverge = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_wave_reconverge, { IntMask });
}

/// Is the branch condition uniform or did the StructurizeCFG pass
@@ -203,8 +204,6 @@ bool SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {

/// Open a new "If" block
bool SIAnnotateControlFlow::openIf(BranchInst *Term) {
if (isUniform(Term))
return false;

IRBuilder<> IRB(Term);
Value *IfCall = IRB.CreateCall(If, {Term->getCondition()});
@@ -305,43 +304,43 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
}

/// Close the last opened control flow
bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
llvm::Loop *L = LI->getLoopFor(BB);
bool SIAnnotateControlFlow::tryWaveReconverge(BasicBlock *BB) {

assert(Stack.back().first == BB);
if (succ_empty(BB))
return false;

if (L && L->getHeader() == BB) {
// We can't insert an EndCF call into a loop header, because it will
// get executed on every iteration of the loop, when it should be
// executed only once before the loop.
SmallVector <BasicBlock *, 8> Latches;
L->getLoopLatches(Latches);
BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
if (Term->getNumSuccessors() == 1) {
// The current BB's single successor is the top of the stack. We need to
// reconverge over that path.
BasicBlock *SingleSucc = *succ_begin(BB);
BasicBlock::iterator InsPt = Term ? BasicBlock::iterator(Term) : BB->end();

SmallVector<BasicBlock *, 2> Preds;
for (BasicBlock *Pred : predecessors(BB)) {
if (!is_contained(Latches, Pred))
Preds.push_back(Pred);
if (isTopOfStack(SingleSucc)) {
Value *Exec = Stack.back().second;
IRBuilder<>(BB, InsPt).CreateCall(WaveReconverge, {Exec});
}

BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr,
false);
}

Value *Exec = popSaved();
BasicBlock::iterator FirstInsertionPt = BB->getFirstInsertionPt();
if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) {
Instruction *ExecDef = cast<Instruction>(Exec);
BasicBlock *DefBB = ExecDef->getParent();
if (!DT->dominates(DefBB, BB)) {
// Split edge to make Def dominate Use
FirstInsertionPt = SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
} else {
// We have a uniform conditional branch terminating the block.
// This block may be the last in the Then path of the enclosing divergent
// IF.
if (!isUniform(Term))
// Divergent loop is going to be further processed in another place
return false;

for (auto Succ : Term->successors()) {
if (isTopOfStack(Succ)) {
// Just split to make room for the later WAVE_RECONVERGE insertion
SmallVector<BasicBlock*, 2> Preds;
for (auto P : predecessors(Succ)) {
if (DT->dominates(BB, P))
Preds.push_back(P);
}
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
SplitBlockPredecessors(Succ, Preds, ".reconverge", &DTU, LI,
nullptr, false);
}
}
IRBuilder<> IRB(FirstInsertionPt->getParent(), FirstInsertionPt);
// TODO: StructurizeCFG 'Flow' blocks have debug locations from the
// condition, for now just avoid copying these DebugLocs so that stepping
// out of the then/else block in a debugger doesn't step to the condition.
IRB.SetCurrentDebugLocation(DebugLoc());
IRB.CreateCall(EndCf, {Exec});
}

return true;
@@ -365,14 +364,20 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {

if (!Term || Term->isUnconditional()) {
if (isTopOfStack(BB))
Changed |= closeControlFlow(BB);
Stack.pop_back();

Changed |= tryWaveReconverge(BB);

continue;
}

if (I.nodeVisited(Term->getSuccessor(1))) {
if (isTopOfStack(BB))
Changed |= closeControlFlow(BB);
Stack.pop_back();

// Let's take care of a uniform loop latch that may be closing the Then
// path of the enclosing divergent branch.
Changed |= tryWaveReconverge(BB);

if (DT->dominates(Term->getSuccessor(1), BB))
Changed |= handleLoop(Term);
Expand All @@ -387,10 +392,15 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
continue;
}

Changed |= closeControlFlow(BB);
Stack.pop_back();
}

Changed |= openIf(Term);
if (isUniform(Term))
// A uniform conditional branch may be in the block that closes the Then
// path of the divergent conditional branch.
Changed |= tryWaveReconverge(BB);
else
Changed |= openIf(Term);
}

if (!Stack.empty()) {
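
To make the uniform-branch path in `tryWaveReconverge` concrete, here is a hypothetical sketch (again wave64; all names are illustrative, not from this patch's tests) of a uniform loop latch closing the Then path of a divergent if. The pass splits the latch's exit edge off the join block, and the resulting `.reconverge` block receives the call:

```llvm
; Hedged sketch of the post-annotation CFG; %n is assumed uniform (inreg).
declare { i1, i64 } @llvm.amdgcn.if.i64(i1)
declare void @llvm.amdgcn.wave.reconverge.i64(i64)

define amdgpu_ps i32 @latch_sketch(i32 %div, i32 inreg %n) {
entry:
  %cc = icmp eq i32 %div, 0
  %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cc)
  %guard = extractvalue { i1, i64 } %if, 0
  %mask = extractvalue { i1, i64 } %if, 1
  br i1 %guard, label %header, label %merge

header:                            ; uniform loop inside the Then path
  %i = phi i32 [ 0, %entry ], [ %i.next, %header ]
  %i.next = add i32 %i, 1
  %done = icmp eq i32 %i.next, %n  ; uniform latch condition
  br i1 %done, label %merge.reconverge, label %header

merge.reconverge:                  ; block carved out by SplitBlockPredecessors
  call void @llvm.amdgcn.wave.reconverge.i64(i64 %mask)
  br label %merge

merge:                             ; join point of the enclosing divergent if
  %r = phi i32 [ %i.next, %merge.reconverge ], [ 0, %entry ]
  ret i32 %r
}
```
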
34 changes: 30 additions & 4 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6299,7 +6299,7 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
return AMDGPUISD::ELSE;
case Intrinsic::amdgcn_loop:
return AMDGPUISD::LOOP;
case Intrinsic::amdgcn_end_cf:
case Intrinsic::amdgcn_wave_reconverge:
llvm_unreachable("should not occur");
default:
return 0;
@@ -9940,8 +9940,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,

return SDValue(Load, 0);
}
case Intrinsic::amdgcn_end_cf:
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
case Intrinsic::amdgcn_wave_reconverge:
return SDValue(DAG.getMachineNode(AMDGPU::SI_WAVE_RECONVERGE, DL, MVT::Other,
Op->getOperand(2), Chain), 0);
case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_join:
@@ -15740,6 +15740,32 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
}
}

// ISel inserts copies to registers for the successor PHIs at the end of
// the block. We need to move the SI_WAVE_RECONVERGE right before the
// branch.
for (auto &MBB : MF) {
for (auto &MI : MBB) {
if (MI.getOpcode() == AMDGPU::SI_WAVE_RECONVERGE) {
MachineBasicBlock::iterator I(MI);
MachineBasicBlock::iterator Next = std::next(I);
bool NeedToMove = false;
while (Next != MBB.end() && !Next->isBranch()) {
NeedToMove = true;
Next++;
}

assert((Next == MBB.end() || !Next->readsRegister(AMDGPU::SCC, TRI)) &&
"Malformed CFG detected!\n");

if (NeedToMove) {
MBB.splice(Next, &MBB, &MI);
}

break;
}
}
}

// FIXME: This is a hack to fixup AGPR classes to use the properly aligned
// classes if required. Ideally the register class constraints would differ
// per-subtarget, but there's no easy way to achieve that right now. This is
@@ -16336,7 +16362,7 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
default:
Result = false;
break;
case Intrinsic::amdgcn_end_cf:
case Intrinsic::amdgcn_wave_reconverge:
case Intrinsic::amdgcn_loop:
Result = true;
break;
31 changes: 30 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2103,12 +2103,36 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(AMDGPU::S_MOV_B64));
break;

case AMDGPU::S_CMOV_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_CMOV_B64));
break;

case AMDGPU::S_MOV_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_MOV_B32));
break;

case AMDGPU::S_CMOV_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_CMOV_B32));
break;

case AMDGPU::S_CSELECT_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_CSELECT_B32));
break;

case AMDGPU::S_CSELECT_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_CSELECT_B64));
break;

case AMDGPU::S_XOR_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
@@ -3088,20 +3112,25 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
while (I != E && !I->isBranch() && !I->isReturn()) {
switch (I->getOpcode()) {
case AMDGPU::S_MOV_B64_term:
case AMDGPU::S_CMOV_B64_term:
case AMDGPU::S_XOR_B64_term:
case AMDGPU::S_OR_B64_term:
case AMDGPU::S_ANDN2_B64_term:
case AMDGPU::S_AND_B64_term:
case AMDGPU::S_AND_SAVEEXEC_B64_term:
case AMDGPU::S_CSELECT_B64_term:
case AMDGPU::S_MOV_B32_term:
case AMDGPU::S_CMOV_B32_term:
case AMDGPU::S_XOR_B32_term:
case AMDGPU::S_OR_B32_term:
case AMDGPU::S_ANDN2_B32_term:
case AMDGPU::S_AND_B32_term:
case AMDGPU::S_AND_SAVEEXEC_B32_term:
case AMDGPU::S_CSELECT_B32_term:
break;
case AMDGPU::SI_IF:
case AMDGPU::SI_ELSE:
case AMDGPU::SI_WAVE_RECONVERGE:
case AMDGPU::SI_KILL_I1_TERMINATOR:
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
// FIXME: It's messy that these need to be considered here at all.
@@ -8782,7 +8811,7 @@ void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
.add(Branch->getOperand(0))
.add(Branch->getOperand(1));
MachineInstr *SIEND =
BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_WAVE_RECONVERGE))
.addReg(DstReg);

IfEntry->erase(TI);
10 changes: 7 additions & 3 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -350,6 +350,8 @@ class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<

let WaveSizePredicate = isWave64 in {
def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
def S_CMOV_B64_term : WrapTerminatorInst<S_CMOV_B64>;
def S_CSELECT_B64_term : WrapTerminatorInst<S_CSELECT_B64>;
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
@@ -359,6 +361,8 @@ def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst<S_AND_SAVEEXEC_B64>;

let WaveSizePredicate = isWave32 in {
def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
def S_CMOV_B32_term : WrapTerminatorInst<S_CMOV_B32>;
def S_CSELECT_B32_term : WrapTerminatorInst<S_CSELECT_B32>;
def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
@@ -475,9 +479,7 @@ def SI_LOOP : CFPseudoInstSI <
let IsNeverUniform = 1;
}

} // End isTerminator = 1

def SI_END_CF : CFPseudoInstSI <
def SI_WAVE_RECONVERGE : CFPseudoInstSI <
(outs), (ins SReg_1:$saved), [], 1, 1> {
let Size = 4;
let isAsCheapAsAMove = 1;
@@ -488,6 +490,8 @@ def SI_END_CF : CFPseudoInstSI <
let mayStore = 1;
}

} // End isTerminator = 1

def SI_IF_BREAK : CFPseudoInstSI <
(outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
let Size = 4;