diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 3dc8cc17afc16c..df7906ebd8a7ea 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -219,11 +219,12 @@ class SIWholeQuadMode : public MachineFunctionPass { void lowerBlock(MachineBasicBlock &MBB); void processBlock(MachineBasicBlock &MBB, bool IsEntry); - void lowerLiveMaskQueries(); - void lowerCopyInstrs(); - void lowerKillInstrs(bool IsWQM); + bool lowerLiveMaskQueries(); + bool lowerCopyInstrs(); + bool lowerKillInstrs(bool IsWQM); void lowerInitExec(MachineInstr &MI); - MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry); + MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry, + bool &Changed); public: static char ID; @@ -796,6 +797,8 @@ MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB, MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI) { + assert(LiveMaskReg.isVirtual()); + const DebugLoc &DL = MI.getDebugLoc(); unsigned Opcode = 0; @@ -913,6 +916,8 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB, MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI, bool IsWQM) { + assert(LiveMaskReg.isVirtual()); + const DebugLoc &DL = MI.getDebugLoc(); MachineInstr *MaskUpdateMI = nullptr; @@ -1144,6 +1149,8 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, Register SaveWQM) { + assert(LiveMaskReg.isVirtual()); + bool IsTerminator = Before == MBB.end(); if (!IsTerminator) { auto FirstTerm = MBB.getFirstTerminator(); @@ -1423,7 +1430,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { assert(!SavedNonStrictReg); } -void SIWholeQuadMode::lowerLiveMaskQueries() { +bool SIWholeQuadMode::lowerLiveMaskQueries() { for (MachineInstr *MI : LiveMaskQueries) { const DebugLoc &DL = MI->getDebugLoc(); Register Dest = MI->getOperand(0).getReg(); @@ -1435,9 +1442,10 @@ void SIWholeQuadMode::lowerLiveMaskQueries() { LIS->ReplaceMachineInstrInMaps(*MI, *Copy); MI->eraseFromParent(); } + return !LiveMaskQueries.empty(); } -void SIWholeQuadMode::lowerCopyInstrs() { +bool SIWholeQuadMode::lowerCopyInstrs() { for (MachineInstr *MI : LowerToMovInstrs) { assert(MI->getNumExplicitOperands() == 2); @@ -1492,9 +1500,10 @@ void SIWholeQuadMode::lowerCopyInstrs() { *MRI, MI->getOperand(0))); MI->setDesc(TII->get(CopyOp)); } + return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty(); } -void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { +bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { for (MachineInstr *MI : KillInstrs) { MachineBasicBlock *MBB = MI->getParent(); MachineInstr *SplitPoint = nullptr; @@ -1510,6 +1519,7 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { if (SplitPoint) splitBlock(MBB, SplitPoint); } + return !KillInstrs.empty(); } void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) { @@ -1601,7 +1611,7 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) { /// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry /// for instructions that depend on EXEC. MachineBasicBlock::iterator -SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) { +SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) { MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI(); for (MachineInstr *MI : InitExecInstrs) { @@ -1612,6 +1622,7 @@ SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) { InsertPt = std::next(MI->getIterator()); lowerInitExec(*MI); + Changed = true; } return InsertPt; @@ -1664,48 +1675,50 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { } const char GlobalFlags = analyzeFunction(MF); - const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty()); + bool Changed = false; LiveMaskReg = Exec; MachineBasicBlock &Entry = MF.front(); - MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry); - - // Shader is simple does not need any state changes or any complex lowering - if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() && - LowerToMovInstrs.empty() && KillInstrs.empty()) { - lowerLiveMaskQueries(); - if (!InitExecInstrs.empty()) - LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); - return !InitExecInstrs.empty() || !LiveMaskQueries.empty(); - } + MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed); // Store a copy of the original live mask when required - if (NeedsLiveMask || (GlobalFlags & StateWQM)) { + const bool HasLiveMaskQueries = !LiveMaskQueries.empty(); + const bool HasWaveModes = GlobalFlags & ~StateExact; + const bool HasKills = !KillInstrs.empty(); + const bool UsesWQM = GlobalFlags & StateWQM; + if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) { LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) .addReg(Exec); LIS->InsertMachineInstrInMaps(*MI); + Changed = true; } LLVM_DEBUG(printInfo()); - lowerLiveMaskQueries(); - lowerCopyInstrs(); + Changed |= lowerLiveMaskQueries(); + Changed |= lowerCopyInstrs(); - // Shader only needs WQM - if (GlobalFlags == StateWQM) { + if (!HasWaveModes) { + // No wave mode execution + Changed |= lowerKillInstrs(false); + } else if (GlobalFlags == StateWQM) { + // Shader only needs WQM auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec) .addReg(Exec); LIS->InsertMachineInstrInMaps(*MI); lowerKillInstrs(true); + Changed = true; } else { + // Wave mode switching requires full lowering pass. for (auto BII : Blocks) processBlock(*BII.first, BII.first == &Entry); // Lowering blocks causes block splitting so perform as a second pass. for (auto BII : Blocks) lowerBlock(*BII.first); + Changed = true; } // Compute live range for live mask @@ -1721,5 +1734,5 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { if (!KillInstrs.empty() || !InitExecInstrs.empty()) LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); - return true; + return Changed; }