-
Notifications
You must be signed in to change notification settings - Fork 11.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU/GlobalISel: AMDGPURegBankLegalize #112864
base: users/petar-avramovic/new-rbs-rb-select
Are you sure you want to change the base?
AMDGPU/GlobalISel: AMDGPURegBankLegalize #112864
Conversation
Warning This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. Join @petar-avramovic and the rest of your teammates on Graphite |
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: Petar Avramovic (petar-avramovic) Changes: Lower G_ instructions that can't be inst-selected with register bank
assignment. Given LLTs on all operands after legalizer, some register bank assignments require lowering while others do not. RBLegalize goals:
Since this is the first patch that actually enables -new-reg-bank-select
Patch is 140.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112864.diff 16 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index 6f6ad5cf82cae1..244d58c2fd0810 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -107,3 +107,183 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg());
}
}
+
+MachineInstrBuilder AMDGPU::buildReadAnyLaneB32(MachineIRBuilder &B,
+ const DstOp &SgprDst,
+ const SrcOp &VgprSrc,
+ const RegisterBankInfo &RBI) {
+ auto RFL = B.buildInstr(AMDGPU::G_READANYLANE, {SgprDst}, {VgprSrc});
+ Register Dst = RFL->getOperand(0).getReg();
+ Register Src = RFL->getOperand(1).getReg();
+ MachineRegisterInfo &MRI = *B.getMRI();
+ if (!MRI.getRegBankOrNull(Dst))
+ MRI.setRegBank(Dst, RBI.getRegBank(SGPRRegBankID));
+ if (!MRI.getRegBankOrNull(Src))
+ MRI.setRegBank(Src, RBI.getRegBank(VGPRRegBankID));
+ return RFL;
+}
+
+MachineInstrBuilder
+AMDGPU::buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B, const DstOp &SgprDst,
+ const SrcOp &VgprSrc, LLT B32Ty,
+ const RegisterBankInfo &RBI) {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ SmallVector<Register, 8> SgprDstParts;
+ auto Unmerge = B.buildUnmerge(B32Ty, VgprSrc);
+ for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
+ SgprDstParts.push_back(
+ buildReadAnyLaneB32(B, B32Ty, Unmerge.getReg(i), RBI).getReg(0));
+ }
+
+ auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts);
+ MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID));
+ return Merge;
+}
+
+MachineInstrBuilder
+AMDGPU::buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B, const DstOp &SgprDst,
+ const SrcOp &VgprSrc,
+ const RegisterBankInfo &RBI) {
+ LLT S32 = LLT::scalar(32);
+ LLT S64 = LLT::scalar(64);
+ MachineRegisterInfo &MRI = *B.getMRI();
+ SmallVector<Register, 8> SgprDstParts;
+ auto Unmerge = B.buildUnmerge(S64, VgprSrc);
+
+ for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
+ MRI.setRegBank(Unmerge.getReg(i), RBI.getRegBank(AMDGPU::VGPRRegBankID));
+ auto Unmerge64 = B.buildUnmerge(S32, Unmerge.getReg(i));
+ SmallVector<Register, 2> Unmerge64Parts;
+ Unmerge64Parts.push_back(
+ buildReadAnyLaneB32(B, S32, Unmerge64.getReg(0), RBI).getReg(0));
+ Unmerge64Parts.push_back(
+ buildReadAnyLaneB32(B, S32, Unmerge64.getReg(1), RBI).getReg(0));
+ Register MergeReg = B.buildMergeLikeInstr(S64, Unmerge64Parts).getReg(0);
+ MRI.setRegBank(MergeReg, RBI.getRegBank(AMDGPU::SGPRRegBankID));
+ SgprDstParts.push_back(MergeReg);
+ }
+
+ auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts);
+ MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID));
+ return Merge;
+}
+
+MachineInstrBuilder AMDGPU::buildReadAnyLane(MachineIRBuilder &B,
+ const DstOp &SgprDst,
+ const SrcOp &VgprSrc,
+ const RegisterBankInfo &RBI) {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ LLT S16 = LLT::scalar(16);
+ LLT S32 = LLT::scalar(32);
+ LLT S64 = LLT::scalar(64);
+ LLT S256 = LLT::scalar(256);
+ LLT V2S16 = LLT::fixed_vector(2, 16);
+ LLT Ty = SgprDst.getLLTTy(MRI);
+
+ if (Ty == S16) {
+ return B.buildTrunc(
+ SgprDst, buildReadAnyLaneB32(B, S32, B.buildAnyExt(S32, VgprSrc), RBI));
+ }
+
+ if (Ty == S32 || Ty == V2S16 ||
+ (Ty.isPointer() && Ty.getSizeInBits() == 32)) {
+ return buildReadAnyLaneB32(B, SgprDst, VgprSrc, RBI);
+ }
+
+ if (Ty == S64 || Ty == S256 || (Ty.isPointer() && Ty.getSizeInBits() == 64) ||
+ (Ty.isVector() && Ty.getElementType() == S32)) {
+ return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, S32, RBI);
+ }
+
+ if (Ty.isVector() && Ty.getElementType() == S16) {
+ return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, V2S16, RBI);
+ }
+
+ if (Ty.isVector() && Ty.getElementType() == S64) {
+ return buildReadAnyLaneSequenceOfS64(B, SgprDst, VgprSrc, RBI);
+ }
+
+ llvm_unreachable("Type not supported");
+}
+
+void AMDGPU::buildReadAnyLaneDst(MachineIRBuilder &B, MachineInstr &MI,
+ const RegisterBankInfo &RBI) {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ Register Dst = MI.getOperand(0).getReg();
+ const RegisterBank *DstBank = MRI.getRegBankOrNull(Dst);
+ if (DstBank != &RBI.getRegBank(AMDGPU::SGPRRegBankID))
+ return;
+
+ Register VgprDst = MRI.createGenericVirtualRegister(MRI.getType(Dst));
+ MRI.setRegBank(VgprDst, RBI.getRegBank(AMDGPU::VGPRRegBankID));
+
+ MI.getOperand(0).setReg(VgprDst);
+ MachineBasicBlock *MBB = MI.getParent();
+ B.setInsertPt(*MBB, std::next(MI.getIterator()));
+ // readAnyLane VgprDst into Dst after MI.
+ buildReadAnyLane(B, Dst, VgprDst, RBI);
+ return;
+}
+
+bool AMDGPU::isLaneMask(Register Reg, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *TRI) {
+ const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
+ if (RB && RB->getID() == VCCRegBankID)
+ return true;
+
+ const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+ if (RC && TRI->isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1))
+ return true;
+
+ return false;
+}
+
+bool AMDGPU::isSgprRB(Register Reg, MachineRegisterInfo &MRI) {
+ const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
+ if (RB && RB->getID() == SGPRRegBankID)
+ return true;
+
+ return false;
+}
+
+bool AMDGPU::isVgprRB(Register Reg, MachineRegisterInfo &MRI) {
+ const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
+ if (RB && RB->getID() == VGPRRegBankID)
+ return true;
+
+ return false;
+}
+
+void AMDGPU::cleanUpAfterCombine(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineInstr *Optional0) {
+ MI.eraseFromParent();
+ if (Optional0 && isTriviallyDead(*Optional0, MRI))
+ Optional0->eraseFromParent();
+}
+
+bool AMDGPU::hasSGPRS1(MachineFunction &MF, MachineRegisterInfo &MRI) {
+ for (auto &MBB : MF) {
+ for (auto &MI : make_early_inc_range(MBB)) {
+ for (MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg())
+ continue;
+
+ Register Reg = Op.getReg();
+ if (!Reg.isVirtual())
+ continue;
+
+ if (!isSgprRB(Reg, MRI) || MRI.getType(Reg) != LLT::scalar(1))
+ continue;
+
+ MI.getParent()->dump();
+ MI.dump();
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+bool AMDGPU::isS1(Register Reg, MachineRegisterInfo &MRI) {
+ return MRI.getType(Reg) == LLT::scalar(1);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 4d504d0204d81a..bf812dd86fbd04 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -9,7 +9,11 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
+#include "AMDGPURegisterBankInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Register.h"
#include <utility>
@@ -48,7 +52,58 @@ class IntrinsicLaneMaskAnalyzer {
// This will not be needed when we turn off LCSSA for global-isel.
void findLCSSAPhi(Register Reg);
};
+
+void buildReadAnyLaneS1(MachineIRBuilder &B, MachineInstr &MI,
+ const RegisterBankInfo &RBI);
+
+MachineInstrBuilder buildReadAnyLaneB32(MachineIRBuilder &B,
+ const DstOp &SgprDst,
+ const SrcOp &VgprSrc,
+ const RegisterBankInfo &RBI);
+
+MachineInstrBuilder buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B,
+ const DstOp &SgprDst,
+ const SrcOp &VgprSrc,
+ LLT B32Ty,
+ const RegisterBankInfo &RBI);
+
+MachineInstrBuilder buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B,
+ const DstOp &SgprDst,
+ const SrcOp &VgprSrc,
+ const RegisterBankInfo &RBI);
+
+MachineInstrBuilder buildReadAnyLane(MachineIRBuilder &B, const DstOp &SgprDst,
+ const SrcOp &VgprSrc,
+ const RegisterBankInfo &RBI);
+
+// Create new vgpr destination register for MI then move it to current
+// MI's sgpr destination using one or more G_READANYLANE instructions.
+void buildReadAnyLaneDst(MachineIRBuilder &B, MachineInstr &MI,
+ const RegisterBankInfo &RBI);
+
+// Share with SIRegisterInfo::isUniformReg? This could make uniformity info give
+// same result in later passes.
+bool isLaneMask(Register Reg, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *TRI);
+
+bool isSgprRB(Register Reg, MachineRegisterInfo &MRI);
+
+bool isVgprRB(Register Reg, MachineRegisterInfo &MRI);
+
+template <typename SrcTy>
+inline MIPatternMatch::UnaryOp_match<SrcTy, AMDGPU::G_READANYLANE>
+m_GReadAnyLane(const SrcTy &Src) {
+ return MIPatternMatch::UnaryOp_match<SrcTy, AMDGPU::G_READANYLANE>(Src);
}
-}
+
+void cleanUpAfterCombine(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineInstr *Optional0 = nullptr);
+
+bool hasSGPRS1(MachineFunction &MF, MachineRegisterInfo &MRI);
+
+bool isS1(Register Reg, MachineRegisterInfo &MRI);
+
+} // namespace AMDGPU
+} // namespace llvm
#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 800bdbe04cf70d..3e1a78050c8a2f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -217,6 +217,75 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
return true;
}
+bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineBasicBlock *BB = I.getParent();
+
+ unsigned CmpOpc =
+ STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
+ MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
+ .addReg(I.getOperand(1).getReg())
+ .addImm(0);
+ if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
+ return false;
+
+ Register DstReg = I.getOperand(0).getReg();
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
+
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SGPR_32RegClass, *MRI);
+}
+
+bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineBasicBlock *BB = I.getParent();
+
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
+ std::optional<ValueAndVReg> Arg =
+ getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
+
+ if (Arg) {
+ const int64_t Value = Arg->Value.getZExtValue();
+ if (Value == 0) {
+ unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
+ BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
+ } else {
+ assert(Value == 1);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(TRI.getExec());
+ }
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
+ }
+
+ // RBLegalize was ensures that SrcReg is bool in reg (high bits are 0).
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
+
+ unsigned SelectOpcode =
+ STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+ MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
+ .addReg(TRI.getExec())
+ .addImm(0);
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
+}
+
+bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
+
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineBasicBlock *BB = I.getParent();
+
+ auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(SrcReg);
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
const Register DefReg = I.getOperand(0).getReg();
const LLT DefTy = MRI->getType(DefReg);
@@ -249,7 +318,21 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
}
}
- // TODO: Verify that all registers have the same bank
+ // If inputs have register bank, assign corresponding reg class.
+ // Note: registers don't need to have the same reg bank.
+ for (unsigned i = 1; i < I.getNumOperands(); i += 2) {
+ const Register SrcReg = I.getOperand(i).getReg();
+
+ const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
+ if (RB) {
+ const LLT SrcTy = MRI->getType(SrcReg);
+ const TargetRegisterClass *SrcRC =
+ TRI.getRegClassForTypeOnBank(SrcTy, *RB);
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
+ return false;
+ }
+ }
+
I.setDesc(TII.get(TargetOpcode::PHI));
return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}
@@ -3656,6 +3739,12 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectStackRestore(I);
case AMDGPU::G_PHI:
return selectPHI(I);
+ case AMDGPU::G_COPY_SCC_VCC:
+ return selectCOPY_SCC_VCC(I);
+ case AMDGPU::G_COPY_VCC_SCC:
+ return selectCOPY_VCC_SCC(I);
+ case AMDGPU::G_READANYLANE:
+ return selectReadAnyLane(I);
case TargetOpcode::G_CONSTANT:
case TargetOpcode::G_FCONSTANT:
default:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index df39ecbd61bce6..11bba12499f0ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -87,6 +87,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool constrainCopyLikeIntrin(MachineInstr &MI, unsigned NewOpc) const;
bool selectCOPY(MachineInstr &I) const;
+ bool selectCOPY_SCC_VCC(MachineInstr &I) const;
+ bool selectCOPY_VCC_SCC(MachineInstr &I) const;
+ bool selectReadAnyLane(MachineInstr &I) const;
bool selectPHI(MachineInstr &I) const;
bool selectG_TRUNC(MachineInstr &I) const;
bool selectG_SZA_EXT(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURBLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURBLegalize.cpp
index 9a9722559377f6..7c348bf759cadc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURBLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURBLegalize.cpp
@@ -18,7 +18,13 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUGlobalISelUtils.h"
+#include "AMDGPURBLegalizeHelper.h"
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
+#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/InitializePasses.h"
#define DEBUG_TYPE "rb-legalize"
@@ -41,6 +47,9 @@ class AMDGPURBLegalize : public MachineFunctionPass {
StringRef getPassName() const override { return "AMDGPU RB Legalize"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<GISelCSEAnalysisWrapperPass>();
+ AU.addRequired<MachineUniformityAnalysisPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -56,6 +65,9 @@ class AMDGPURBLegalize : public MachineFunctionPass {
INITIALIZE_PASS_BEGIN(AMDGPURBLegalize, DEBUG_TYPE, "AMDGPU RB Legalize", false,
false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPURBLegalize, DEBUG_TYPE, "AMDGPU RB Legalize", false,
false)
@@ -69,6 +81,241 @@ FunctionPass *llvm::createAMDGPURBLegalizePass() {
using namespace AMDGPU;
+const RegBankLegalizeRules &getRules(const GCNSubtarget &ST,
+ MachineRegisterInfo &MRI) {
+ static std::mutex GlobalMutex;
+ static SmallDenseMap<unsigned, std::unique_ptr<RegBankLegalizeRules>>
+ CacheForRuleSet;
+ std::lock_guard<std::mutex> Lock(GlobalMutex);
+ if (!CacheForRuleSet.contains(ST.getGeneration())) {
+ auto Rules = std::make_unique<RegBankLegalizeRules>(ST, MRI);
+ CacheForRuleSet[ST.getGeneration()] = std::move(Rules);
+ } else {
+ CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI);
+ }
+ return *CacheForRuleSet[ST.getGeneration()];
+}
+
bool AMDGPURBLegalize::runOnMachineFunction(MachineFunction &MF) {
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Setup the instruction builder with CSE.
+ std::unique_ptr<MachineIRBuilder> MIRBuilder;
+ const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+ GISelCSEAnalysisWrapper &Wrapper =
+ getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
+ GISelCSEInfo *CSEInfo = nullptr;
+ GISelObserverWrapper Observer;
+
+ if (TPC.isGISelCSEEnabled()) {
+ MIRBuilder = std::make_unique<CSEMIRBuilder>();
+ CSEInfo = &Wrapper.get(TPC.getCSEConfig());
+ MIRBuilder->setCSEInfo(CSEInfo);
+ Observer.addObserver(CSEInfo);
+ MIRBuilder->setChangeObserver(Observer);
+ } else {
+ MIRBuilder = std::make_unique<MachineIRBuilder>();
+ }
+ MIRBuilder->setMF(MF);
+
+ RAIIDelegateInstaller DelegateInstaller(MF, &Observer);
+ RAIIMFObserverInstaller MFObserverInstaller(MF, Observer);
+
+ const MachineUniformityInfo &MUI =
+ getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
+ const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo();
+
+ // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes.
+ const RegBankLegalizeRules &RBLRules = getRules(ST, MRI);
+
+ // Logic that does legalization based on IDs assigned to Opcode.
+ RegBankLegalizeHelper RBLegalizeHelper(*MIRBuilder, MRI, MUI, RBI, RBLRules);
+
+ SmallVector<MachineInstr *> AllInst;
+
+ for (auto &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ AllInst.push_back(&MI);
+ }
+ }
+
+ for (auto &MI : AllInst) {
+ if (!MI->isPreISelOpcode())
+ continue;
+
+ unsigned Opc = MI->getOpcode();
+
+ // Insert point for use operands needs some calculation.
+ if (Opc == G_PHI) {
+ RBLegalizeHelper.applyMappingPHI(*MI);
+ continue;
+ }
+
+ // Opcodes that support pretty much all combinations of reg banks and LLTs
+ // (except S1). There is no point in writing rules for them.
+ if (Opc == G_BUILD_VECTOR || Opc == G_UNMERGE_VALUES ||
+ Opc == G_MERGE_VALUES) {
+ RBLegalizeHelper.applyMappingTrivial(*MI);
+ continue;
+ }
+
+ // Opcodes that also support S1. S1 rules are in RegBankLegalizeRules.
+ // Remaining reg bank and LLT combinations are trivially accepted.
+ if ((Opc == G_CONSTANT || Opc == G_FCONSTANT || Opc == G_IMPLICIT_DEF) &&
+ !isS1(MI->getOperand(0).getReg(), MRI)) {
+ assert(isSgprRB(MI->getOperand(0).getReg(), MRI));
+ continue;
+ }
+
+ if (!RBLegalizeHelper.findRuleAndApplyMapping(*MI)) {
+ MI->dump();
+ llvm_unreachable("failed to match any of the rules");
+ }
+ }
+
+ LLT S1 = LLT::scalar(1);
+ LLT S16 = LLT::scalar(16...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
3bd3785
to
fbaf393
Compare
fbaf393
to
921a702
Compare
auto Unmerge = B.buildUnmerge(S64, VgprSrc); | ||
|
||
for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { | ||
MRI.setRegBank(Unmerge.getReg(i), RBI.getRegBank(AMDGPU::VGPRRegBankID)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use the direct VGPRRegBank pointer or pull this out of the loop
const DstOp &SgprDst, | ||
const SrcOp &VgprSrc, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SrcOp / DstOp are for MachineIRBuilder, and other code probably shouldn't be using them
if (Ty == S64 || Ty == S256 || (Ty.isPointer() && Ty.getSizeInBits() == 64) || | ||
(Ty.isVector() && Ty.getElementType() == S32)) { | ||
return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, S32, RBI); | ||
} | ||
|
||
if (Ty.isVector() && Ty.getElementType() == S16) { | ||
return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, V2S16, RBI); | ||
} | ||
|
||
if (Ty.isVector() && Ty.getElementType() == S64) { | ||
return buildReadAnyLaneSequenceOfS64(B, SgprDst, VgprSrc, RBI); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you simply this into one isLegalType predicate? This is just expanding out the 32-bit LCM type?
if (Slot != -1) { | ||
if (MUI.isUniform(Reg)) | ||
return Uni[Slot]; | ||
else |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No else after return
// Note: if fast rules are enabled, RegBankLLTMapping must be added in each | ||
// slot that could "match fast Predicate". If not, Invalid Mapping is | ||
// returned which results in failure, does not search "Slow Rules". | ||
if (FastTypes != No) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"No"?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Renamed to NoFastRules
int SetOfRulesForOpcode::getFastPredicateSlot( | ||
UniformityLLTOpPredicateID Ty) const { | ||
switch (FastTypes) { | ||
case Standard: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add braces
@@ -0,0 +1,258 @@ | |||
//===- AMDGPURBLegalizeRules -------------------------------------*- C++ -*-==// |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it possible to share with the existing legalize rules?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't think sharing is good option for this patch. RBLegalizeRules are much more flexible and I would assume faster because of "FastPredicateSlot".
If we add more IDs that would work with LLTs only we could rewrite Legalizer using RBLegalizeRules. Other way around is questionable, did not consider upgrading LegalityPredicate and LegalizeMutation to work with Register banks
if (!MRI.getRegBankOrNull(Dst)) | ||
MRI.setRegBank(Dst, RBI.getRegBank(SGPRRegBankID)); | ||
if (!MRI.getRegBankOrNull(Src)) | ||
MRI.setRegBank(Src, RBI.getRegBank(VGPRRegBankID)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Probably should add a contrainRegBank method to MRI, similar to constrainRegClass for this pattern
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How should it work in regards to possibility to insert illegal sgpr to vgpr copy and can it fail like register class version?
Or are we looking for something much simpler:
no reg bank -set reg bank
same reg bank - do nothing
different reg bank - insert copy
2124eb3
to
df50c85
Compare
921a702
to
db1cdae
Compare
df50c85
to
36c8a96
Compare
db1cdae
to
3370bba
Compare
36c8a96
to
69dde87
Compare
3370bba
to
3f085e7
Compare
Rebase for new-reg-bank-select taking over AMDGPURegBankSelect |
69dde87
to
9048f0d
Compare
3f085e7
to
f09e1dc
Compare
Lower G_ instructions that can't be inst-selected with register bank assignment from AMDGPURegBankSelect based on uniformity analysis. - Lower instruction to perform it on assigned register bank - Put uniform value in vgpr because SALU instruction is not available - Execute divergent instruction in SALU - "waterfall loop" Given LLTs on all operands after legalizer, some register bank assignments require lowering while other do not. Note: cases where all register bank assignments would require lowering are lowered in legalizer. AMDGPURegBankLegalize goals: - Define Rules: when and how to perform lowering - Goal of defining Rules it to provide high level table-like brief overview of how to lower generic instructions based on available target features and uniformity info (uniform vs divergent). - Fast search of Rules, depends on how complicated Rule.Predicate is - For some opcodes there would be too many Rules that are essentially all the same just for different combinations of types and banks. Write custom function that handles all cases. - Rules are made from enum IDs that correspond to each operand. Names of IDs are meant to give brief description what lowering does for each operand or the whole instruction. - AMDGPURegBankLegalizeHelper implements lowering algorithms Since this is the first patch that actually enables -new-reg-bank-select here is the summary of regression tests that were added earlier: - if instruction is uniform always select SALU instruction if available - eliminate back to back vgpr to sgpr to vgpr copies of uniform values - fast rules: small differences for standard and vector instruction - enabling Rule based on target feature - salu_float - how to specify lowering algorithm - vgpr S64 AND to S32 - on G_TRUNC in reg, it is up to user to deal with truncated bits G_TRUNC in reg is treated as no-op. 
- dealing with truncated high bits - ABS S16 to S32 - sgpr S1 phi lowering - new opcodes for vcc-to-scc and scc-to-vcc copies - lowering for vgprS1-to-vcc copy (formally this is vgpr-to-vcc G_TRUNC) - S1 zext and sext lowering to select - uniform and divergent S1 AND(OR and XOR) lowering - inst-selected into SALU instruction - divergent phi with uniform inputs - divergent instruction with temporal divergent use, source instruction is defined as uniform(AMDGPURegBankSelect) - missing temporal divergence lowering - uniform phi, because of undef incoming, is assigned to vgpr. Will be fixed in AMDGPURegBankSelect via another fix in machine uniformity analysis.
9048f0d
to
d17ca95
Compare
f09e1dc
to
f7ee75a
Compare
Lower G_ instructions that can't be inst-selected with register bank
assignment from AMDGPURegBankSelect based on uniformity analysis.
Given LLTs on all operands after legalizer, some register bank
assignments require lowering while other do not.
Note: cases where all register bank assignments would require lowering
are lowered in legalizer.
AMDGPURegBankLegalize goals:
overview of how to lower generic instructions based on available
target features and uniformity info (uniform vs divergent).
all the same just for different combinations of types and banks.
Write custom function that handles all cases.
Names of IDs are meant to give brief description what lowering does
for each operand or the whole instruction.
Since this is the first patch that actually enables -new-reg-bank-select
here is the summary of regression tests that were added earlier:
G_TRUNC in reg is treated as no-op.
SALU instruction
is defined as uniform(AMDGPURegBankSelect) - missing temporal
divergence lowering
fixed in AMDGPURegBankSelect via another fix in machine uniformity
analysis.