Skip to content

Commit

Permalink
AMDGPU/GlobalISel: AMDGPURegBankSelect
Browse files Browse the repository at this point in the history
Assign register banks to virtual registers. Does not use the generic
RegBankSelect. After register bank selection, all register operands of
G_ instructions have LLTs and register banks exclusively. If they had a
register class, reassign the appropriate register bank.

Assign register banks using machine uniformity analysis:
Sgpr - uniform values and some lane masks
Vgpr - divergent, non-S1 values
Vcc  - divergent S1 values (lane masks)

AMDGPURegBankSelect does not consider available instructions and, in
some cases, G_ instructions with some register bank assignment can't be
inst-selected. This is solved in RegBankLegalize.

Exceptions when uniformity analysis does not work:
S32/S64 lane masks:
- need to end up with an sgpr register class after instruction selection
- In most cases uniformity analysis declares them as uniform
  (forced by tablegen), resulting in an sgpr S32/S64 reg bank
- When uniformity analysis declares them as divergent (some phis),
  use the intrinsic lane mask analyzer to still assign the sgpr register bank
temporal divergence copy:
- COPY to vgpr with implicit use of $exec inside of the cycle
- this copy is declared as uniform by uniformity analysis
- make sure that assigned bank is vgpr
Note: uniformity analysis does not consider that registers with vgpr def
are divergent (you can have uniform value in vgpr).
- TODO: implicit use of $exec could be implemented as indicator
  that instruction is divergent
  • Loading branch information
petar-avramovic committed Oct 30, 2024
1 parent 623266f commit d17ca95
Show file tree
Hide file tree
Showing 5 changed files with 990 additions and 685 deletions.
37 changes: 37 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,14 @@
#include "AMDGPUGlobalISelUtils.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

using namespace llvm;
using namespace AMDGPU;
using namespace MIPatternMatch;

std::pair<Register, unsigned>
Expand Down Expand Up @@ -69,3 +72,37 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,

return std::pair(Reg, 0);
}

// Eagerly scan \p MF and record all S32/S64 lane-mask registers, so that
// later isS32S64LaneMask() queries are simple set lookups.
IntrinsicLaneMaskAnalyzer::IntrinsicLaneMaskAnalyzer(MachineFunction &MF)
    : MRI(MF.getRegInfo()) {
  initLaneMaskIntrinsics(MF);
}

// Returns true if \p Reg was recorded as an S32/S64 lane mask (or an LCSSA
// phi of one) when the analyzer scanned the function.
bool IntrinsicLaneMaskAnalyzer::isS32S64LaneMask(Register Reg) {
  return S32S64LaneMask.count(Reg) != 0;
}

// Scan \p MF once and collect registers that hold S32/S64 lane masks:
// the result and the mask-type argument of amdgcn.if.break intrinsics, and
// the lane-mask def of SI_IF/SI_ELSE, together with LCSSA phis fed by them.
void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) {
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      // Prefer a single dyn_cast over the isa<> + cast<> pair: one type
      // check instead of two.
      if (const auto *GI = dyn_cast<GIntrinsic>(&MI)) {
        if (GI->getIntrinsicID() == Intrinsic::amdgcn_if_break) {
          // Operand 0 is the result def; operand 3 is presumably the
          // saved-mask argument of if.break — confirm against the
          // intrinsic's G_INTRINSIC operand layout.
          S32S64LaneMask.insert(MI.getOperand(3).getReg());
          findLCSSAPhi(MI.getOperand(0).getReg());
        }
      }

      // SI_IF/SI_ELSE define a lane mask in operand 0.
      if (MI.getOpcode() == AMDGPU::SI_IF ||
          MI.getOpcode() == AMDGPU::SI_ELSE)
        findLCSSAPhi(MI.getOperand(0).getReg());
    }
  }
}

// Record \p Reg as a lane mask, and also record every PHI that directly uses
// it — with LCSSA those phis simply forward the lane-mask value.
void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
  S32S64LaneMask.insert(Reg);
  for (const MachineInstr &UseInst : MRI.use_instructions(Reg)) {
    if (!UseInst.isPHI())
      continue;
    S32S64LaneMask.insert(UseInst.getOperand(0).getReg());
  }
}
22 changes: 22 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H

#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Register.h"
#include <utility>

Expand All @@ -26,6 +28,26 @@ std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
GISelKnownBits *KnownBits = nullptr,
bool CheckNUW = false);

// Currently finds S32/S64 lane masks that can be declared as divergent by
// uniformity analysis (all are phis at the moment).
// These are defined as i32/i64 in some IR intrinsics (not as i1).
// Tablegen marks lane-mask IR intrinsics as uniform, which forces most
// S32/S64 lane masks to be uniform; those end up with an sgpr register class
// after instruction-select, so this analyzer does not need to find them all.
class IntrinsicLaneMaskAnalyzer {
  // Registers known to hold S32/S64 lane masks (including LCSSA phis of them).
  SmallDenseSet<Register, 8> S32S64LaneMask;
  MachineRegisterInfo &MRI;

public:
  // 'explicit': single-argument constructor; prevent accidental implicit
  // conversion from MachineFunction.
  explicit IntrinsicLaneMaskAnalyzer(MachineFunction &MF);
  bool isS32S64LaneMask(Register Reg);

private:
  // Populates S32S64LaneMask by scanning MF for lane-mask intrinsics and
  // pseudos (amdgcn.if.break, SI_IF, SI_ELSE).
  void initLaneMaskIntrinsics(MachineFunction &MF);
  // This will not be needed when we turn off LCSSA for global-isel.
  void findLCSSAPhi(Register Reg);
};
}
}

Expand Down
213 changes: 213 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "amdgpu-regbankselect"
Expand All @@ -40,6 +45,7 @@ class AMDGPURegBankSelect : public MachineFunctionPass {
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineUniformityAnalysisPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}

Expand All @@ -55,6 +61,7 @@ class AMDGPURegBankSelect : public MachineFunctionPass {

INITIALIZE_PASS_BEGIN(AMDGPURegBankSelect, DEBUG_TYPE,
"AMDGPU Register Bank Select", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPURegBankSelect, DEBUG_TYPE,
"AMDGPU Register Bank Select", false, false)

Expand All @@ -66,9 +73,215 @@ FunctionPass *llvm::createAMDGPURegBankSelectPass() {
return new AMDGPURegBankSelect();
}

class RegBankSelectHelper {
MachineIRBuilder &B;
MachineRegisterInfo &MRI;
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA;
const MachineUniformityInfo &MUI;
const SIRegisterInfo &TRI;
const RegisterBank *SgprRB;
const RegisterBank *VgprRB;
const RegisterBank *VccRB;

public:
RegBankSelectHelper(MachineIRBuilder &B,
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA,
const MachineUniformityInfo &MUI,
const SIRegisterInfo &TRI, const RegisterBankInfo &RBI)
: B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI),
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}

bool shouldRegBankSelect(MachineInstr &MI) {
return MI.isPreISelOpcode() || MI.isCopy();
}

// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of
// the cycle
// Note: uniformity analysis does not consider that registers with vgpr def
// are divergent (you can have uniform value in vgpr).
// - TODO: implicit use of $exec could be implemented as indicator that
// instruction is divergent
bool isTemporalDivergenceCopy(Register Reg) {
MachineInstr *MI = MRI.getVRegDef(Reg);
if (!MI->isCopy())
return false;

for (auto Op : MI->implicit_operands()) {
if (!Op.isReg())
continue;

if (Op.getReg() == TRI.getExec()) {
return true;
}
}

return false;
}

void setRBDef(MachineInstr &MI, MachineOperand &DefOP,
const RegisterBank *RB) {
Register Reg = DefOP.getReg();
// Register that already has Register class got it during pre-inst selection
// of another instruction. Maybe cross bank copy was required so we insert a
// copy that can be removed later. This simplifies post regbanklegalize
// combiner and avoids need to special case some patterns.
if (MRI.getRegClassOrNull(Reg)) {
LLT Ty = MRI.getType(Reg);
Register NewReg = MRI.createVirtualRegister({RB, Ty});
DefOP.setReg(NewReg);

auto &MBB = *MI.getParent();
B.setInsertPt(MBB, MBB.SkipPHIsAndLabels(std::next(MI.getIterator())));
B.buildCopy(Reg, NewReg);

// The problem was discovered for uniform S1 that was used as both
// lane mask(vcc) and regular sgpr S1.
// - lane-mask(vcc) use was by si_if, this use is divergent and requires
// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets
// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask.
// - the regular sgpr S1(uniform) instruction is now broken since
// it uses sreg_64_xexec(S1) which is divergent.

// Replace virtual registers with register class on generic instructions
// uses with virtual registers with register bank.
for (auto &UseMI : MRI.use_instructions(Reg)) {
if (shouldRegBankSelect(UseMI)) {
for (MachineOperand &Op : UseMI.operands()) {
if (Op.isReg() && Op.getReg() == Reg)
Op.setReg(NewReg);
}
}
}

} else {
MRI.setRegBank(Reg, *RB);
}
}

std::optional<Register> tryGetVReg(MachineOperand &Op) {
if (!Op.isReg())
return std::nullopt;

Register Reg = Op.getReg();
if (!Reg.isVirtual())
return std::nullopt;

return Reg;
}

void assignBanksOnDefs(MachineInstr &MI) {
if (!shouldRegBankSelect(MI))
return;

for (MachineOperand &DefOP : MI.defs()) {
auto MaybeDefReg = tryGetVReg(DefOP);
if (!MaybeDefReg)
continue;
Register DefReg = *MaybeDefReg;

// Copies can have register class on def registers.
if (MI.isCopy() && MRI.getRegClassOrNull(DefReg)) {
continue;
}

if (MUI.isUniform(DefReg) || ILMA.isS32S64LaneMask(DefReg)) {
setRBDef(MI, DefOP, SgprRB);
} else {
if (MRI.getType(DefReg) == LLT::scalar(1))
setRBDef(MI, DefOP, VccRB);
else
setRBDef(MI, DefOP, VgprRB);
}
}
}

void constrainRBUse(MachineInstr &MI, MachineOperand &UseOP,
const RegisterBank *RB) {
Register Reg = UseOP.getReg();

LLT Ty = MRI.getType(Reg);
Register NewReg = MRI.createVirtualRegister({RB, Ty});
UseOP.setReg(NewReg);

if (MI.isPHI()) {
auto DefMI = MRI.getVRegDef(Reg)->getIterator();
MachineBasicBlock *DefMBB = DefMI->getParent();
B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
} else {
B.setInstr(MI);
}

B.buildCopy(NewReg, Reg);
}

void constrainBanksOnUses(MachineInstr &MI) {
if (!shouldRegBankSelect(MI))
return;

// Copies can have register class on use registers.
if (MI.isCopy())
return;

for (MachineOperand &UseOP : MI.uses()) {
auto MaybeUseReg = tryGetVReg(UseOP);
if (!MaybeUseReg)
continue;
Register UseReg = *MaybeUseReg;

// UseReg already has register bank.
if (MRI.getRegBankOrNull(UseReg))
continue;

if (!isTemporalDivergenceCopy(UseReg) &&
(MUI.isUniform(UseReg) || ILMA.isS32S64LaneMask(UseReg))) {
constrainRBUse(MI, UseOP, SgprRB);
} else {
if (MRI.getType(UseReg) == LLT::scalar(1))
constrainRBUse(MI, UseOP, VccRB);
else
constrainRBUse(MI, UseOP, VgprRB);
}
}
}
};

bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
  // Nothing to do if instruction selection already failed on this function.
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineUniformityInfo &Uniformity =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
  AMDGPU::IntrinsicLaneMaskAnalyzer LaneMasks(MF);
  MachineIRBuilder Builder(MF);
  RegBankSelectHelper Helper(Builder, LaneMasks, Uniformity,
                             *ST.getRegisterInfo(), *ST.getRegBankInfo());

  // First walk: give every def on a G_ instruction a register bank; same for
  // copies whose def has neither a register bank nor a class yet.
  for (MachineBasicBlock &Block : MF)
    for (MachineInstr &Instr : Block)
      Helper.assignBanksOnDefs(Instr);

  // At this point each virtual register carries a register class or a bank:
  // - defs of G_ instructions: register banks,
  // - defs and uses of inst-selected instructions: register classes,
  // - defs and uses of copies: either of the two, and most notably
  // - uses of G_ instructions: either of the two.

  // Second walk: rewrite uses of G_ instructions so they carry banks only.
  for (MachineBasicBlock &Block : MF)
    for (MachineInstr &Instr : Block)
      Helper.constrainBanksOnUses(Instr);

  // Defs and uses of G_ instructions now have register banks exclusively.
  return true;
}
Loading

0 comments on commit d17ca95

Please sign in to comment.