Skip to content

Commit

Permalink
AMDGPU/GlobalISel: StandaloneRegBankSelect
Browse files Browse the repository at this point in the history
Assign register banks to virtual registers. Does not use generic
RegBankSelect. Defs and uses of G_ instructions have register banks
exclusively; if they previously had a register class, reassign the
appropriate register bank.

Assign register banks using machine uniformity analysis:
SGPR - uniform values and some lane masks
VGPR - divergent, non-S1, values
VCC  - divergent S1 values (lane masks)

StandaloneRegBankSelect does not consider available instructions and, in
some cases, G_ instructions with some register bank assignment can't be
inst-selected. This is solved in RegBankLegalize.

Exceptions when uniformity analysis does not work:
S32/S64 lane masks:
- need to end up with SGPR register class after instruction selection
- In most cases Uniformity analysis declares them as uniform
  (forced by tablegen) resulting in sgpr S32/S64 reg bank
- When Uniformity analysis declares them as divergent (some phis),
  use intrinsic lane mask analyzer to still assign sgpr register bank
temporal divergence copy:
- COPY to vgpr with implicit use of $exec inside of the cycle
- this copy is declared as uniform by uniformity analysis
- make sure that assigned bank is vgpr
Note: uniformity analysis does not consider that registers with vgpr def
are divergent (you can have uniform value in vgpr).
- TODO: implicit use of $exec could be implemented as indicator
  that instruction is divergent
  • Loading branch information
petar-avramovic committed Oct 22, 2024
1 parent 0c40f68 commit df50c85
Show file tree
Hide file tree
Showing 5 changed files with 996 additions and 685 deletions.
38 changes: 38 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,17 @@
//===----------------------------------------------------------------------===//

#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

using namespace llvm;
using namespace AMDGPU;
using namespace MIPatternMatch;

std::pair<Register, unsigned>
Expand Down Expand Up @@ -69,3 +73,37 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,

return std::pair(Reg, 0);
}

// Scan MF once at construction time and cache every register that holds an
// S32/S64 lane mask; later queries via isS32S64LaneMask are set lookups.
IntrinsicLaneMaskAnalyzer::IntrinsicLaneMaskAnalyzer(MachineFunction &MF)
    : MRI(MF.getRegInfo()) {
  initLaneMaskIntrinsics(MF);
}

// Return true if Reg was recorded as an S32/S64 lane mask during the
// construction-time scan of the function.
bool IntrinsicLaneMaskAnalyzer::isS32S64LaneMask(Register Reg) {
  return S32S64LaneMask.count(Reg) != 0;
}

// Record registers that hold S32/S64 lane masks: the lane-mask operands of
// amdgcn.if.break intrinsics and the defs of SI_IF/SI_ELSE pseudos, plus any
// LCSSA phis fed by those defs (see findLCSSAPhi).
void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) {
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // Idiomatic single dyn_cast instead of an isa<> + cast<> pair.
      if (auto *GI = dyn_cast<GIntrinsic>(&MI);
          GI && GI->getIntrinsicID() == Intrinsic::amdgcn_if_break) {
        // Operand 3 is the incoming lane-mask operand of the intrinsic;
        // operand 0 is its lane-mask def.
        S32S64LaneMask.insert(MI.getOperand(3).getReg());
        findLCSSAPhi(MI.getOperand(0).getReg());
      }

      unsigned Opc = MI.getOpcode();
      if (Opc == AMDGPU::SI_IF || Opc == AMDGPU::SI_ELSE)
        findLCSSAPhi(MI.getOperand(0).getReg());
    }
  }
}

// Record Reg as a lane mask, and additionally record the def of every phi
// that uses it, so LCSSA phis of lane masks are treated as lane masks too.
void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
  S32S64LaneMask.insert(Reg);
  for (const MachineInstr &UseInst : MRI.use_instructions(Reg)) {
    if (!UseInst.isPHI())
      continue;
    S32S64LaneMask.insert(UseInst.getOperand(0).getReg());
  }
}
22 changes: 22 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H

#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Register.h"
#include <utility>

Expand All @@ -26,6 +28,26 @@ std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
GISelKnownBits *KnownBits = nullptr,
bool CheckNUW = false);

// Currently finds the S32/S64 lane masks that uniformity analysis may declare
// divergent (at the moment these are all phis).
// Such lane masks are defined as i32/i64 (not i1) by some IR intrinsics.
// Tablegen forces most S32/S64 lane masks to be uniform (by declaring the lane
// mask IR intrinsics uniform), since that makes them end up with an sgpr reg
// class after instruction selection, so we don't search for all of them.
class IntrinsicLaneMaskAnalyzer {
  // Registers (and defs of their LCSSA phis) that must be treated as lane
  // masks even when uniformity analysis declares them divergent.
  DenseSet<Register> S32S64LaneMask;
  MachineRegisterInfo &MRI;

public:
  // Scans MF once on construction; results are queried via isS32S64LaneMask.
  // explicit: a single-argument constructor must not be an implicit
  // conversion from MachineFunction.
  explicit IntrinsicLaneMaskAnalyzer(MachineFunction &MF);
  // True if Reg was recorded as an S32/S64 lane mask during construction.
  bool isS32S64LaneMask(Register Reg);

private:
  void initLaneMaskIntrinsics(MachineFunction &MF);
  // This will not be needed when we turn off LCSSA for global-isel.
  void findLCSSAPhi(Register Reg);
};
}
}

Expand Down
218 changes: 218 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUStandaloneRegBankSelect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,13 @@
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "amdgpu-standalone-regbankselect"
Expand All @@ -41,6 +47,7 @@ class AMDGPUStandaloneRegBankSelect : public MachineFunctionPass {
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineUniformityAnalysisPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}

Expand Down Expand Up @@ -68,9 +75,220 @@ FunctionPass *llvm::createAMDGPUStandaloneRegBankSelectPass() {
return new AMDGPUStandaloneRegBankSelect();
}

class RegBankSelectHelper {
MachineFunction &MF;
MachineIRBuilder &B;
MachineRegisterInfo &MRI;
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA;
const MachineUniformityInfo &MUI;
const SIRegisterInfo &TRI;
const RegisterBank *SgprRB;
const RegisterBank *VgprRB;
const RegisterBank *VccRB;

public:
RegBankSelectHelper(MachineFunction &MF, MachineIRBuilder &B,
MachineRegisterInfo &MRI,
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA,
const MachineUniformityInfo &MUI,
const SIRegisterInfo &TRI, const RegisterBankInfo &RBI)
: MF(MF), B(B), MRI(MRI), ILMA(ILMA), MUI(MUI), TRI(TRI),
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}

bool shouldRegBankSelect(MachineInstr &MI) {
return MI.isPreISelOpcode() || MI.isCopy();
}

void setRBDef(MachineInstr &MI, MachineOperand &DefOP,
const RegisterBank *RB) {
Register Reg = DefOP.getReg();
// Register that already has Register class got it during pre-inst selection
// of another instruction. Maybe cross bank copy was required so we insert a
// copy that can be removed later. This simplifies post-rb-legalize artifact
// combiner and avoids need to special case some patterns.
if (MRI.getRegClassOrNull(Reg)) {
LLT Ty = MRI.getType(Reg);
Register NewReg = MRI.createVirtualRegister({RB, Ty});
DefOP.setReg(NewReg);

auto &MBB = *MI.getParent();
B.setInsertPt(MBB, MBB.SkipPHIsAndLabels(std::next(MI.getIterator())));
B.buildCopy(Reg, NewReg);

// The problem was discovered for uniform S1 that was used as both
// lane mask(vcc) and regular sgpr S1.
// - lane-mask(vcc) use was by si_if, this use is divergent and requires
// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets
// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask.
// - the regular sgpr S1(uniform) instruction is now broken since
// it uses sreg_64_xexec(S1) which is divergent.

// "Clear" reg classes from uses on generic instructions and put register
// banks instead.
for (auto &UseMI : MRI.use_instructions(Reg)) {
if (shouldRegBankSelect(UseMI)) {
for (MachineOperand &Op : UseMI.operands()) {
if (Op.isReg() && Op.getReg() == Reg)
Op.setReg(NewReg);
}
}
}

} else {
MRI.setRegBank(Reg, *RB);
}
}

void constrainRBUse(MachineInstr &MI, MachineOperand &UseOP,
const RegisterBank *RB) {
Register Reg = UseOP.getReg();

LLT Ty = MRI.getType(Reg);
Register NewReg = MRI.createVirtualRegister({RB, Ty});
UseOP.setReg(NewReg);

if (MI.isPHI()) {
auto DefMI = MRI.getVRegDef(Reg)->getIterator();
MachineBasicBlock *DefMBB = DefMI->getParent();
B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
} else {
B.setInstr(MI);
}

B.buildCopy(NewReg, Reg);
}

std::optional<Register> tryGetVReg(MachineOperand &Op) {
if (!Op.isReg())
return std::nullopt;

Register Reg = Op.getReg();
if (!Reg.isVirtual())
return std::nullopt;

return Reg;
}

void assignBanksOnDefs() {
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
if (!shouldRegBankSelect(MI))
continue;

for (MachineOperand &DefOP : MI.defs()) {
auto MaybeDefReg = tryGetVReg(DefOP);
if (!MaybeDefReg)
continue;
Register DefReg = *MaybeDefReg;

// Copies can have register class on def registers.
if (MI.isCopy() && MRI.getRegClassOrNull(DefReg)) {
continue;
}

if (MUI.isUniform(DefReg) || ILMA.isS32S64LaneMask(DefReg)) {
setRBDef(MI, DefOP, SgprRB);
} else {
if (MRI.getType(DefReg) == LLT::scalar(1))
setRBDef(MI, DefOP, VccRB);
else
setRBDef(MI, DefOP, VgprRB);
}
}
}
}
}

// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of
// the cycle
// Note: uniformity analysis does not consider that registers with vgpr def
// are divergent (you can have uniform value in vgpr).
// - TODO: implicit use of $exec could be implemented as indicator that
// instruction is divergent
bool isTemporalDivergenceCopy(Register Reg) {
MachineInstr *MI = MRI.getVRegDef(Reg);
if (!MI->isCopy())
return false;

for (auto Op : MI->implicit_operands()) {
if (!Op.isReg())
continue;

if (Op.getReg() == TRI.getExec()) {
return true;
}
}

return false;
}

void constrainBanksOnUses() {
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
if (!shouldRegBankSelect(MI))
continue;

// Copies can have register class on use registers.
if (MI.isCopy())
continue;

for (MachineOperand &UseOP : MI.uses()) {
auto MaybeUseReg = tryGetVReg(UseOP);
if (!MaybeUseReg)
continue;
Register UseReg = *MaybeUseReg;

// UseReg already has register bank.
if (MRI.getRegBankOrNull(UseReg))
continue;

if (!isTemporalDivergenceCopy(UseReg) &&
(MUI.isUniform(UseReg) || ILMA.isS32S64LaneMask(UseReg))) {
constrainRBUse(MI, UseOP, SgprRB);
} else {
if (MRI.getType(UseReg) == LLT::scalar(1))
constrainRBUse(MI, UseOP, VccRB);
else
constrainRBUse(MI, UseOP, VgprRB);
}
}
}
}
}
};

bool AMDGPUStandaloneRegBankSelect::runOnMachineFunction(MachineFunction &MF) {
  // Nothing to do if an earlier GlobalISel stage already failed.
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo &TRI = *ST.getRegisterInfo();
  const RegisterBankInfo &RBI = *ST.getRegBankInfo();
  MachineUniformityInfo &MUI =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(MF);

  MachineIRBuilder Builder(MF);
  RegBankSelectHelper Helper(MF, Builder, MRI, ILMA, MUI, TRI, RBI);

  // Assign register banks to ALL def registers on G_ instructions.
  // Same for copies if they have no register bank or class on def.
  Helper.assignBanksOnDefs();

  // At this point all virtual registers have register class or bank
  // - Defs of G_ instructions have register banks.
  // - Defs and uses of inst-selected instructions have register class.
  // - Defs and uses of copies can have either register class or bank
  // and most notably
  // - Uses of G_ instructions can have either register class or bank

  // Reassign uses of G_ instructions to only have register banks.
  Helper.constrainBanksOnUses();

  // Defs and uses of G_ instructions have register banks exclusively.
  return true;
}
Loading

0 comments on commit df50c85

Please sign in to comment.