[AMDGPU][AMDGPUDemoteSCCBranchToExecz] demote s_cbranch_scc0/1 branches into vcmp + s_cbranch_execz branches #110284
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Juan Manuel Martinez Caamaño (jmmartinez)

Changes: Follow up from #109818, only the last 2 commits matter.

Patch is 73.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/110284.diff

28 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 4abb5a63ab6d2c..0eb415e66fe170 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -369,6 +369,17 @@ extern char &AMDGPUCodeGenPrepareID;
void initializeAMDGPURemoveIncompatibleFunctionsPass(PassRegistry &);
extern char &AMDGPURemoveIncompatibleFunctionsID;
+void initializeAMDGPUDemoteSCCBranchToExeczLegacyPass(PassRegistry &);
+extern char &AMDGPUDemoteSCCBranchToExeczLegacyID;
+
+class AMDGPUDemoteSCCBranchToExeczPass
+ : public PassInfoMixin<AMDGPUDemoteSCCBranchToExeczPass> {
+public:
+ AMDGPUDemoteSCCBranchToExeczPass() = default;
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
void initializeAMDGPULateCodeGenPrepareLegacyPass(PassRegistry &);
extern char &AMDGPULateCodeGenPrepareLegacyID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp
new file mode 100644
index 00000000000000..f5987cc629ce8d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp
@@ -0,0 +1,306 @@
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+#include <optional>
+
+using namespace llvm;
+
+namespace {
+#define DEBUG_TYPE "amdgpu-demote-scc-to-execz"
+const char PassName[] = "AMDGPU s_cbranch_scc to s_cbranch_execz conversion";
+
+std::optional<unsigned> getVALUOpc(const MachineInstr &MI,
+ bool Reverse = false) {
+ unsigned Opc = MI.getOpcode();
+ if (Reverse) {
+ switch (Opc) {
+ case AMDGPU::S_CMP_EQ_I32:
+ Opc = AMDGPU::S_CMP_LG_I32;
+ break;
+ case AMDGPU::S_CMP_LG_I32:
+ Opc = AMDGPU::S_CMP_EQ_I32;
+ break;
+ case AMDGPU::S_CMP_GT_I32:
+ Opc = AMDGPU::S_CMP_LE_I32;
+ break;
+ case AMDGPU::S_CMP_GE_I32:
+ Opc = AMDGPU::S_CMP_LT_I32;
+ break;
+ case AMDGPU::S_CMP_LT_I32:
+ Opc = AMDGPU::S_CMP_GE_I32;
+ break;
+ case AMDGPU::S_CMP_LE_I32:
+ Opc = AMDGPU::S_CMP_GT_I32;
+ break;
+ case AMDGPU::S_CMP_EQ_U32:
+ Opc = AMDGPU::S_CMP_LG_U32;
+ break;
+ case AMDGPU::S_CMP_LG_U32:
+ Opc = AMDGPU::S_CMP_EQ_U32;
+ break;
+ case AMDGPU::S_CMP_GT_U32:
+ Opc = AMDGPU::S_CMP_LE_U32;
+ break;
+ case AMDGPU::S_CMP_GE_U32:
+ Opc = AMDGPU::S_CMP_LT_U32;
+ break;
+ case AMDGPU::S_CMP_LT_U32:
+ Opc = AMDGPU::S_CMP_GE_U32;
+ break;
+ case AMDGPU::S_CMP_LE_U32:
+ Opc = AMDGPU::S_CMP_GT_U32;
+ break;
+ case AMDGPU::S_CMP_EQ_U64:
+ Opc = AMDGPU::S_CMP_LG_U64;
+ break;
+ case AMDGPU::S_CMP_LG_U64:
+ Opc = AMDGPU::S_CMP_EQ_U64;
+ break;
+ default:
+ return std::nullopt;
+ }
+ }
+
+ switch (Opc) {
+ case AMDGPU::S_CMP_EQ_I32:
+ return AMDGPU::V_CMP_EQ_I32_e64;
+ case AMDGPU::S_CMP_LG_I32:
+ return AMDGPU::V_CMP_NE_I32_e64;
+ case AMDGPU::S_CMP_GT_I32:
+ return AMDGPU::V_CMP_GT_I32_e64;
+ case AMDGPU::S_CMP_GE_I32:
+ return AMDGPU::V_CMP_GE_I32_e64;
+ case AMDGPU::S_CMP_LT_I32:
+ return AMDGPU::V_CMP_LT_I32_e64;
+ case AMDGPU::S_CMP_LE_I32:
+ return AMDGPU::V_CMP_LE_I32_e64;
+ case AMDGPU::S_CMP_EQ_U32:
+ return AMDGPU::V_CMP_EQ_U32_e64;
+ case AMDGPU::S_CMP_LG_U32:
+ return AMDGPU::V_CMP_NE_U32_e64;
+ case AMDGPU::S_CMP_GT_U32:
+ return AMDGPU::V_CMP_GT_U32_e64;
+ case AMDGPU::S_CMP_GE_U32:
+ return AMDGPU::V_CMP_GE_U32_e64;
+ case AMDGPU::S_CMP_LT_U32:
+ return AMDGPU::V_CMP_LT_U32_e64;
+ case AMDGPU::S_CMP_LE_U32:
+ return AMDGPU::V_CMP_LE_U32_e64;
+ case AMDGPU::S_CMP_EQ_U64:
+ return AMDGPU::V_CMP_EQ_U64_e64;
+ case AMDGPU::S_CMP_LG_U64:
+ return AMDGPU::V_CMP_NE_U64_e64;
+ default:
+ return std::nullopt;
+ }
+}
+
+bool isSCmpPromotableToVCmp(const MachineInstr &MI) {
+ return getVALUOpc(MI).has_value();
+}
+
+bool isTriangular(MachineBasicBlock &Head, MachineBasicBlock *&Then,
+ MachineBasicBlock *&Tail) {
+ if (Head.succ_size() != 2)
+ return false;
+
+ Then = Head.succ_begin()[0];
+ Tail = Head.succ_begin()[1];
+
+ // Canonicalize so that Then has Head as its single predecessor.
+ if (Then->pred_size() != 1)
+ std::swap(Then, Tail);
+
+ if (Then->pred_size() != 1 || Then->succ_size() != 1)
+ return false;
+
+ return *Then->succ_begin() == Tail;
+}
+
+bool hasPromotableCmpCondition(MachineInstr &Term, MachineInstr *&Cmp) {
+ auto CmpIt = std::next(Term.getReverseIterator());
+ if (CmpIt == Term.getParent()->instr_rend())
+ return false;
+
+ if (!isSCmpPromotableToVCmp(*CmpIt))
+ return false;
+
+ Cmp = &*CmpIt;
+ return true;
+}
+
+bool hasCbranchSCCTerm(MachineBasicBlock &Head, MachineInstr *&Term) {
+ auto TermIt = Head.getFirstInstrTerminator();
+ if (TermIt == Head.end())
+ return false;
+
+ switch (TermIt->getOpcode()) {
+ case AMDGPU::S_CBRANCH_SCC0:
+ case AMDGPU::S_CBRANCH_SCC1:
+ Term = &*TermIt;
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool isTriangularSCCBranch(MachineBasicBlock &Head, MachineInstr *&Term,
+ MachineInstr *&Cmp, MachineBasicBlock *&Then,
+ MachineBasicBlock *&Tail) {
+
+ if (!hasCbranchSCCTerm(Head, Term))
+ return false;
+
+ if (!isTriangular(Head, Then, Tail))
+ return false;
+
+ // PHI nodes in the tail can prevent splicing the instructions of the then
+ // and tail blocks into the head.
+ if (!Tail->empty() && Tail->begin()->isPHI())
+ return false;
+
+ if (!hasPromotableCmpCondition(*Term, Cmp))
+ return false;
+
+ return true;
+}
+
+bool SCC1JumpsToThen(const MachineInstr &Term, const MachineBasicBlock &Then) {
+ MachineBasicBlock *TBB = Term.getOperand(0).getMBB();
+ return (TBB == &Then) == (Term.getOpcode() == AMDGPU::S_CBRANCH_SCC1);
+}
+
+class AMDGPUDemoteSCCBranchToExecz {
+ MachineFunction &MF;
+ const GCNSubtarget &ST;
+ const SIInstrInfo &TII;
+ const SIRegisterInfo &RegInfo;
+ const TargetSchedModel &SchedModel;
+
+public:
+ AMDGPUDemoteSCCBranchToExecz(MachineFunction &MF)
+ : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+ RegInfo(*ST.getRegisterInfo()), SchedModel(TII.getSchedModel()) {}
+
+ bool shouldDemoteSCCBranch(const MachineInstr &Term, const MachineInstr &Cmp,
+ const MachineBasicBlock &Then,
+ const MachineBasicBlock &Tail) {
+ bool IsWave32 = TII.isWave32();
+ unsigned AndSaveExecOpc =
+ IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+ unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned NewOps[] = {*getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then)),
+ AndSaveExecOpc, Mov};
+ unsigned NewOpsCost = 0;
+ for (unsigned Opc : NewOps)
+ NewOpsCost += SchedModel.computeInstrLatency(Opc);
+ unsigned OldCmpCost = SchedModel.computeInstrLatency(&Cmp, false);
+
+ assert(NewOpsCost >= OldCmpCost);
+ return !TII.mustRetainExeczBranch(*Term.getParent(), Then, Tail,
+ NewOpsCost - OldCmpCost);
+ }
+
+ void demoteCmp(MachineInstr &Term, MachineInstr &Cmp, MachineBasicBlock &Head,
+ MachineBasicBlock &Then, MachineBasicBlock &Tail) {
+ unsigned NewCmpOpc = *getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then));
+ Cmp.setDesc(TII.get(NewCmpOpc));
+
+ MachineOperand L = Cmp.getOperand(0);
+ MachineOperand R = Cmp.getOperand(1);
+ for (unsigned i = 3; i != 0; --i)
+ Cmp.removeOperand(i - 1);
+
+ auto VCC = RegInfo.getVCC();
+ auto Exec = RegInfo.getExec();
+
+ auto &MRI = MF.getRegInfo();
+ MCRegister ExecBackup =
+ MRI.createVirtualRegister(RegInfo.getPhysRegBaseClass(Exec));
+
+ Cmp.addOperand(MachineOperand::CreateReg(VCC, true));
+ Cmp.addOperand(L);
+ Cmp.addOperand(R);
+ Cmp.addImplicitDefUseOperands(MF);
+
+ TII.legalizeOperands(Cmp);
+
+ bool IsWave32 = TII.isWave32();
+ unsigned AndSaveExecOpc =
+ IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+ auto SaveAndMaskExec = BuildMI(*Term.getParent(), Term, Cmp.getDebugLoc(),
+ TII.get(AndSaveExecOpc), ExecBackup);
+ SaveAndMaskExec.addReg(VCC, RegState::Kill);
+ SaveAndMaskExec->getOperand(3).setIsDead(); // mark SCC as dead
+
+ DebugLoc DL = Term.getDebugLoc();
+ TII.removeBranch(Head);
+ MachineOperand Cond[] = {
+ MachineOperand::CreateImm(SIInstrInfo::BranchPredicate::EXECZ),
+ MachineOperand::CreateReg(RegInfo.getExec(), false)};
+ TII.insertBranch(Head, &Tail, &Then, Cond, DL);
+
+ TII.restoreExec(MF, Tail, Tail.instr_begin(), DebugLoc(), ExecBackup);
+ }
+
+ bool run() {
+ if (!SchedModel.hasInstrSchedModel())
+ return false;
+ bool Changed = false;
+
+ for (MachineBasicBlock &Head : MF) {
+ MachineInstr *Term;
+ MachineInstr *Cmp;
+ MachineBasicBlock *Then;
+ MachineBasicBlock *Tail;
+ if (!isTriangularSCCBranch(Head, Term, Cmp, Then, Tail))
+ continue;
+
+ if (!shouldDemoteSCCBranch(*Term, *Cmp, *Then, *Tail))
+ continue;
+
+ demoteCmp(*Term, *Cmp, Head, *Then, *Tail);
+ Changed = true;
+ }
+ return Changed;
+ }
+};
+
+class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUDemoteSCCBranchToExeczLegacy() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
+ return IfCvt.run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return PassName; }
+};
+
+char AMDGPUDemoteSCCBranchToExeczLegacy::ID = 0;
+
+} // namespace
+
+PreservedAnalyses llvm::AMDGPUDemoteSCCBranchToExeczPass::run(
+ MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) {
+ AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
+ if (!IfCvt.run())
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+char &llvm::AMDGPUDemoteSCCBranchToExeczLegacyID =
+ AMDGPUDemoteSCCBranchToExeczLegacy::ID;
+INITIALIZE_PASS_BEGIN(AMDGPUDemoteSCCBranchToExeczLegacy, DEBUG_TYPE, PassName,
+ false, false)
+INITIALIZE_PASS_END(AMDGPUDemoteSCCBranchToExeczLegacy, DEBUG_TYPE, PassName,
+ false, false)
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 0ebf34c901c142..d968ac61eea39d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -95,6 +95,7 @@ FUNCTION_PASS_WITH_PARAMS(
#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
#endif
MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this))
+MACHINE_FUNCTION_PASS("amdgpu-demote-scc-to-execz", AMDGPUDemoteSCCBranchToExeczPass())
MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass())
MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index abd50748f2cc05..2c9cc95bf8b0bb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -444,6 +444,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowLegacyPass(*PR);
+ initializeAMDGPUDemoteSCCBranchToExeczLegacyPass(*PR);
initializeAMDGPUInsertDelayAluPass(*PR);
initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
@@ -1283,7 +1284,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
bool GCNPassConfig::addILPOpts() {
if (EnableEarlyIfConversion)
addPass(&EarlyIfConverterID);
-
+ addPass(&AMDGPUDemoteSCCBranchToExeczLegacyID);
TargetPassConfig::addILPOpts();
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 7c883cc2017ddd..180fb7ac0c5153 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -59,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUGlobalISelDivergenceLowering.cpp
AMDGPUGlobalISelUtils.cpp
AMDGPUHSAMetadataStreamer.cpp
+ AMDGPUDemoteSCCBranchToExecz.cpp
AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
AMDGPUInstrInfo.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 44ee5c56a237b4..24fe598607688e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4115,6 +4115,96 @@ bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
}
+namespace {
+class BranchWeightCostModel {
+ const SIInstrInfo &TII;
+ const TargetSchedModel &SchedModel;
+ BranchProbability BranchProb;
+ uint64_t BranchCost;
+ uint64_t ThenCyclesCost;
+
+public:
+ BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
+ const MachineBasicBlock &Succ,
+ unsigned ExtraTransformationCosts = 0)
+ : TII(TII), SchedModel(TII.getSchedModel()),
+ ThenCyclesCost(ExtraTransformationCosts) {
+ assert(SchedModel.hasInstrSchedModelOrItineraries());
+
+ const MachineBasicBlock &Head = *Branch.getParent();
+ const auto *FromIt = find(Head.successors(), &Succ);
+ assert(FromIt != Head.succ_end());
+
+ BranchProb = Head.getSuccProbability(FromIt);
+ if (BranchProb.isUnknown())
+ return;
+
+ BranchCost = SchedModel.computeInstrLatency(&Branch, false);
+ }
+
+ bool isUnknown() const { return BranchProb.isUnknown(); }
+
+ bool isProfitable(const MachineInstr &MI) {
+ assert(!isUnknown());
+
+ if (TII.isWaitcnt(MI.getOpcode()))
+ return false;
+
+ ThenCyclesCost += SchedModel.computeInstrLatency(&MI, false);
+
+ // Consider `P = N/D` to be the probability of execnz being true
+ // The transformation is profitable if always executing the 'then' block
+ // is cheaper than executing sometimes 'then' and always
+ // executing s_cbranch_execnz:
+ // * ThenCost + Extra <= P*(ThenCost + Extra) + BranchCost
+ // * (1-P) * (ThenCost + Extra) <= BranchCost
+ // * (D-N)/D * (ThenCost + Extra) <= BranchCost
+ uint64_t Numerator = BranchProb.getNumerator();
+ uint64_t Denominator = BranchProb.getDenominator();
+ return (Denominator - Numerator) * ThenCyclesCost <=
+ Denominator * BranchCost;
+ }
+};
+} // namespace
+
+bool SIInstrInfo::mustRetainExeczBranch(
+ const MachineBasicBlock &Head, const MachineBasicBlock &From,
+ const MachineBasicBlock &To, unsigned ExtraTransformationCosts) const {
+
+ const auto *FromIt = find(Head.successors(), &From);
+ assert(FromIt != Head.succ_end());
+
+ BranchWeightCostModel CostModel{*this, *Head.getFirstTerminator(), From,
+ ExtraTransformationCosts};
+ if (CostModel.isUnknown())
+ return true;
+
+ const MachineFunction *MF = From.getParent();
+ for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+ MBBI != End && MBBI != ToI; ++MBBI) {
+ const MachineBasicBlock &MBB = *MBBI;
+
+ for (const MachineInstr &MI : MBB) {
+ // When a uniform loop is inside non-uniform control flow, the branch
+ // leaving the loop might never be taken when EXEC = 0.
+ // Hence we should retain cbranch out of the loop lest it become infinite.
+ if (MI.isConditionalBranch())
+ return true;
+
+ if (MI.isMetaInstruction())
+ continue;
+
+ if (hasUnwantedEffectsWhenEXECEmpty(MI))
+ return true;
+
+ if (!CostModel.isProfitable(MI))
+ return true;
+ }
+ }
+
+ return false;
+}
+
bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index f7554906a9c98b..78d75389f5a866 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -87,6 +87,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
TargetSchedModel SchedModel;
mutable std::unique_ptr<AMDGPUMIRFormatter> Formatter;
+public:
// The inverse predicate should have the negative value.
enum BranchPredicate {
INVALID_BR = 0,
@@ -98,6 +99,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
EXECZ = 3
};
+private:
using SetVectorType = SmallSetVector<MachineInstr *, 32>;
static unsigned getBranchOpcode(BranchPredicate Cond);
@@ -1031,13 +1033,21 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
/// Return true if the instruction modifies the mode register.
static bool modifiesModeRegister(const MachineInstr &MI);
+ /// Returns true if the execz branch from \p Head over the blocks from
+ /// \p From to \p To must be retained, i.e. removing it is unsafe or
+ /// unprofitable.
+ bool mustRetainExeczBranch(const MachineBasicBlock &Head,
+ const MachineBasicBlock &From,
+ const MachineBasicBlock &To,
+ unsigned ExtraTransformationCosts = 0) const;
+
/// This function is used to determine if an instruction can be safely
/// executed under EXEC = 0 without hardware error, indeterminate results,
/// and/or visible effects on future vector execution or outside the shader.
- /// Note: as of 2024 the only use of this is SIPreEmitPeephole where it is
- /// used in removing branches over short EXEC = 0 sequences.
- /// As such it embeds certain assumptions which may not apply to every case
- /// of EXEC = 0 execution.
+ /// Note: as of 2024 the only uses of this are SIPreEmitPeephole and
+ /// AMDGPUDemoteSCCBranchToExecz (through SIInstrInfo::mustRetainExeczBranch)
+ /// where it is used in removing branches over short EXEC = 0 sequences. As
+ /// such it embeds certain assumptions which may not apply to every case of
+ /// EXEC = 0 execution.
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
/// Returns true if the instruction could potentially depend on the value of
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 1334029544f999..1c71c245ed79d0 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -15,19 +15,12 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetSchedule.h"
using namespace llvm;
#define DEBUG_TYPE "si-pre-emit-peephole"
-static unsigned SkipThreshold;
-
-static cl::opt<unsigned, true> SkipThresholdFlag(
- "amdgpu-skip-threshold", cl::Hidden,
- cl::desc(
- "Number of instructions before jumping over divergent control flow"),
- cl::location(SkipThreshold), cl::init(12));
-
namespace {
class SIPreEmitPeephole : public MachineFunctionPass {
@@ -41,8 +34,6 @@ class SIPreEmitPeephole : public MachineFunctionPass {
MachineBasicBlock *&TrueMBB,
MachineBasicBlock *&FalseMBB,
SmallVectorImpl<MachineOperand> &Cond);
- bool mustRetainExeczBranch(const MachineBasicBlock &From,
- const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
public:
@@ -304,45 +295,13 @@ bool SIPreEmitPeephole::getBlockDestinations(
return true;
}
-bool SIPreEmitPeephole::mustRetainExeczBranch(
- const MachineBasicBlock &From, const MachineBasicBlock &To) const {
- unsigned NumInstr = 0;
- const MachineFunction *MF = From.getParent();
-
- for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
- MBBI != End && MBBI != ToI; ++MBBI) {
- const MachineBasicBlock &MBB = *MBBI;
-
- for (const MachineInstr &MI : MBB) {
- // When a uniform loop is inside non-uniform c...
[truncated]
Force-pushed from 6f1d22d to 4493290.
Why would you want to do this?
INITIALIZE_PASS_BEGIN(AMDGPUDemoteSCCBranchToExeczLegacy, DEBUG_TYPE, PassName,
                      false, false)
INITIALIZE_PASS_END(AMDGPUDemoteSCCBranchToExeczLegacy, DEBUG_TYPE, PassName,
                    false, false)
Missing newline at end of file
Force-pushed from 60c8833 to dc24dfa.
I've updated the PR with an explanation on the first comment.
My intuition is that something has gone wrong upstream of this if this is a profitable transformation. Do you have a more complete example of something this helps?
} | ||
} | ||
|
||
bool isSCmpPromotableToVCmp(const MachineInstr &MI) { |
static
These functions are in an anonymous namespace.
LLVM style is to prefer static over an anonymous namespace.
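For illustration, a minimal sketch of the suggested style, reusing one of the helpers from this patch (not part of the patch itself):

    // LLVM's coding standards prefer 'static' for file-local functions and
    // reserve anonymous namespaces for type declarations.
    static bool isSCmpPromotableToVCmp(const MachineInstr &MI) {
      return getVALUOpc(MI).has_value();
    }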
  case AMDGPU::S_CMP_LG_U64:
    return AMDGPU::V_CMP_NE_U64_e64;
  default:
    return std::nullopt;
All compare types are present in both, there is no failure case
I didn't cover any of the floating-point S_CMP instructions for the moment. And currently I'm using the failure case for another purpose: to verify that the instruction before the s_cbranch_scc is an s_cmp we can handle.
I've changed this to shorter, less error-prone code using macros.
  return getVALUOpc(MI).has_value();
}

bool isTriangular(MachineBasicBlock &Head, MachineBasicBlock *&Then,
Can you do this with analyzeBranch instead of recreating the logic?
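For reference, a rough sketch of what that could look like (untested; assumes the standard TargetInstrInfo::analyzeBranch contract):

    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
    SmallVector<MachineOperand, 4> Cond;
    // analyzeBranch returns true on failure; on success it fills in the
    // taken/fall-through destinations and the branch condition, which
    // would replace the hand-rolled successor inspection above.
    if (TII.analyzeBranch(Head, TBB, FBB, Cond, /*AllowModify=*/false))
      return false;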
Force-pushed from dc24dfa to 4626fb4.
I'm going to check what's going on with the whole CK kernel to give some more context. I'd expect all the branches to be lowered to pseudo instructions and then to the actual instruction in
} | ||
} | ||
|
||
bool isSCmpPromotableToVCmp(const MachineInstr &MI) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These functions are in an anonymous namespace.
case AMDGPU::S_CMP_LG_U64: | ||
return AMDGPU::V_CMP_NE_U64_e64; | ||
default: | ||
return std::nullopt; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I didn't cover any of the float-point S_CMP instructions for the moment. And currently I'm using the failure case for another purpose: verify that the instruction before the s_cbranch_scc
is a s_cmp
we can handle.
I've changed this to have a shorter-less-error-prone code using macros.
    return false;

  bool SCCIsUsedOutsideHead = any_of(
      Head.liveouts(), [](const auto &P) { return P.PhysReg == AMDGPU::SCC; });
Add profitable test case where this condition is true
Follow-up from #109818; only the last 2 commits matter.
This is the last commit to solve SWDEV-477895
The motivation of this transformation is to convert this kind of s_cmp + scc branch:
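(The example code blocks did not survive this scrape; the snippets below are hypothetical reconstructions based on the pass code above, with illustrative registers and labels.)

    s_cmp_lg_u32 s0, 0
    s_cbranch_scc0 .LBB0_2      ; branch over %then when s0 == 0
    ; %then: single-predecessor block that falls through to %tail
    v_mov_b32_e32 v1, v0
    .LBB0_2:                    ; %tail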
Into a branchless v_cmpx on gfx1030:
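A hypothetical wave32 result, after SIOptimizeExecMasking folds the save-exec into v_cmpx:

    s_mov_b32 s1, exec_lo       ; back up EXEC
    v_cmpx_ne_u32_e64 s0, 0     ; compare and write the result directly to EXEC
    v_mov_b32_e32 v1, v0        ; %then, always executed, inactive lanes masked
    s_mov_b32 exec_lo, s1       ; restore EXEC at the start of %tail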
Into a branchless v_cmp on others:
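A hypothetical wave64 result of the new pass, once SIPreEmitPeephole has removed the now-redundant s_cbranch_execz:

    v_cmp_ne_u32_e64 vcc, s0, 0     ; the demoted compare writes VCC
    s_and_saveexec_b64 s[2:3], vcc  ; back up EXEC and mask it with the condition
    v_mov_b32_e32 v1, v0            ; %then executed unconditionally
    s_mov_b64 exec, s[2:3]          ; restore EXEC at the start of %tail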
To do this:
- the v_cmp is transformed to v_cmpx by SIOptimizeExecMasking::optimizeVCMPSaveExecSequence; otherwise we stay with the v_cmp sequence.
- the redundant s_cbranch_execz branches are removed by SIPreEmitPeephole.
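Assuming the legacy pass keeps the amdgpu-demote-scc-to-execz name registered above, it can be exercised in isolation on MIR input, e.g.:

    llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=amdgpu-demote-scc-to-execz test.mir -o -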